crfm-helm: crfm_helm-0.5.3-py3-none-any.whl → crfm_helm-0.5.5-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606):
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,456 @@
1
+ /**
2
+ * This is a very quick and dirty frontend for just interacting with the models.
3
+ * Please refrain from adding additional functionality to this.
4
+ * TODO: Write this in React.
5
+ */
6
+ $(function () {
7
+ const urlParams = decodeUrlParams(window.location.search);
8
+ let auth = null;
9
+
10
/**
 * Mask an API key for display, leaving only its first two characters visible.
 *
 * @param {string} api_key - The API key to mask.
 * @returns {string} The key with every character after the second replaced by "*";
 *   keys of two characters or fewer are returned unchanged.
 */
function censor(api_key) {
  const visibleCount = 2;
  const hiddenCount = api_key.length - visibleCount;
  if (hiddenCount <= 0) {
    // Nothing to hide: the key is no longer than the visible prefix.
    return api_key;
  }
  const visiblePrefix = api_key.substring(0, visibleCount);
  return visiblePrefix + "*".repeat(hiddenCount);
}
18
+
19
+ // Logging in and out
20
/**
 * Refresh the login area (#loginInfo) and the shared `auth` state from the "api_key" cookie.
 *
 * With a cookie present, `auth` becomes `{ api_key }` and a logout link is shown; clicking it
 * erases the cookie and re-renders. Without one, `auth` is reset to null and a login link is
 * shown; clicking it prompts for a key, sends it to /api/account, and stores it in the cookie
 * only when the response has no "error" field.
 *
 * `readCookie`, `createCookie` and `eraseCookie` are helpers defined outside this block.
 */
function updateLogin() {
  const $loginInfo = $("#loginInfo");
  $loginInfo.empty();
  const api_key = readCookie("api_key");
  if (api_key) {
    auth = { api_key };
    $loginInfo.append(
      $("<a>", { class: "nav-link", href: "#" })
        .append("Logout of API key " + censor(api_key))
        .click(() => {
          eraseCookie("api_key");
          updateLogin();
        }),
    );
  } else {
    auth = null;
    $loginInfo.append(
      $("<a>", { class: "nav-link", href: "#" })
        .append("Login")
        .click(() => {
          // Keep the entered key local to this click. Sharing one variable across clicks
          // would let a second login attempt, made while this check is still in flight,
          // change which key the callback below stores.
          const enteredKey = prompt("Enter your API key:");
          if (!enteredKey) {
            return;
          }

          // Check the API key the user entered using endpoint /api/account
          const args = { auth: JSON.stringify({ api_key: enteredKey }) };
          $.getJSON("/api/account", args, (response) => {
            console.log("/api/account", response);
            if ("error" in response) {
              alert("The API key you've entered is invalid. Try again.");
            } else {
              createCookie("api_key", enteredKey);
              updateLogin();
            }
          });
        }),
    );
  }
}
60
+
61
+ updateLogin();
62
+
63
+ ////////////////////////////////////////////////////////////
64
+ // Rendering functions
65
+
66
/**
 * Build the "Examples:" strip: one numbered link per entry of `generalInfo.example_queries`.
 *
 * Hovering a link shows the example's prompt, settings and environments; clicking it passes
 * the example to `updateQuery`, copies its fields into `urlParams`, and updates the browser
 * location.
 *
 * @param {function(Object): void} updateQuery - Called with the clicked example query.
 * @returns {jQuery} The block containing the label and the links.
 */
function renderExampleQueries(updateQuery) {
  const $examplesBlock = $("<div>", { class: "examples-block" });
  $examplesBlock.append($("<span>").append("Examples:"));

  generalInfo.example_queries.forEach((query, i) => {
    // Tooltip listing the three parts of the example.
    const tooltip = `[Prompt]\n${query.prompt}\n[Settings]\n${query.settings}\n[Environments]\n${query.environments}`;

    const $link = $("<a>", { href: "#", title: tooltip });
    $link.append(`[${i}]`);
    $link.click(() => {
      // Populate the query with the example, then mirror it into the URL.
      updateQuery(query);
      const { prompt, settings, environments } = query;
      urlParams.prompt = prompt;
      urlParams.settings = settings;
      urlParams.environments = environments;
      updateBrowserLocation(urlParams);
    });

    $examplesBlock.append("&nbsp;").append($link);
  });

  return $examplesBlock;
}
93
+
94
/**
 * Build the query editor: three textareas (prompt, settings, environments) pre-filled from
 * `urlParams`, the example-query links, help icons, and a Submit button.
 *
 * Submitting (via the button or Ctrl+Enter in any textarea) requires `auth` to be set, copies
 * the textarea values into `urlParams`, updates the browser location, and sends the query to
 * /api/query.
 *
 * @param {function(Object): void} handleQueryResult - Receives the /api/query response.
 * @returns {jQuery} The query block.
 */
function renderQuery(handleQueryResult) {
  const $queryBlock = $("<div>", { class: "block" });
  const $prompt = $("<textarea>", {
    cols: 90,
    rows: 7,
    placeholder: "Enter prompt",
  }).val(urlParams.prompt);
  const $settings = $("<textarea>", {
    cols: 90,
    rows: 5,
    placeholder:
      "Enter settings (e.g., model: openai/text-davinci-002 for Instruct GPT-3); click Help at the top to learn more",
  }).val(urlParams.settings);
  const $environments = $("<textarea>", {
    cols: 90,
    rows: 3,
    placeholder:
      "Enter environment variables (e.g., city: [Boston, New York]); click Help at the top to learn more",
  }).val(urlParams.environments);

  $queryBlock.data("prompt", $prompt);
  $queryBlock.data("settings", $settings);
  $queryBlock.data("environments", $environments);

  function submit() {
    if (!auth) {
      alert("You must log in first.");
      return;
    }

    const query = {
      prompt: $prompt.val(),
      settings: $settings.val(),
      environments: $environments.val(),
    };

    urlParams.prompt = query.prompt;
    urlParams.settings = query.settings;
    urlParams.environments = query.environments;
    updateBrowserLocation(urlParams);

    $.getJSON("/api/query", query, handleQueryResult);
  }

  function bindSubmit($text) {
    // Ctrl+Enter submits. Read the key state from the handler's own event argument:
    // the implicit global `event` is a legacy, non-standard way to reach it.
    $text.keypress((e) => {
      if ((e.keyCode === 10 || e.keyCode === 13) && e.ctrlKey) {
        submit();
      }
    });
  }

  bindSubmit($prompt);
  bindSubmit($settings);
  bindSubmit($environments);

  // Show examples of queries
  const $exampleQueries = renderExampleQueries((query) => {
    $queryBlock.data("prompt").val(query.prompt);
    $queryBlock.data("settings").val(query.settings);
    $queryBlock.data("environments").val(query.environments);
    urlParams.prompt = query.prompt;
    urlParams.settings = query.settings;
    urlParams.environments = query.environments;
    // Pass the parameters explicitly, as every other call site in this file does.
    updateBrowserLocation(urlParams);
  });

  const promptHelp =
    "This is the text you feed into the language model to complete.\nExample:\n Life is like";
  const settingsHelp =
    "Specifies what information we want from the language model (see [Help] for more details).\nExample:\n temperature: ${temperature}\n model: openai/davinci\n max_tokens: 10\n num_completions: 5";
  const environmentsHelp =
    "Specifies a list of values to try for each variable that appears in the prompt or settings.\nExample:\n temperature: [0, 0.5, 1]";

  const $promptLabel = $("<span>")
    .append(helpIcon(promptHelp, "help.html#query"))
    .append("Prompt");
  const $settingsLabel = $("<span>")
    .append(helpIcon(settingsHelp, "help.html#query"))
    .append("Settings");
  const $environmentsLabel = $("<span>")
    .append(helpIcon(environmentsHelp, "help.html#query"))
    .append("Environments");

  $queryBlock.append($("<h4>").append("Query"));
  $queryBlock.append($exampleQueries);

  // One table row per field: label cell on the left, textarea cell on the right.
  const $table = $("<table>", { class: "query-table" });
  const rows = [
    [$promptLabel, $prompt],
    [$settingsLabel, $settings],
    [$environmentsLabel, $environments],
  ];
  rows.forEach(([$label, $field]) => {
    $table.append(
      $("<tr>")
        .append($("<td>").append($label))
        .append($("<td>").append($field)),
    );
  });
  $queryBlock.append($table);
  $queryBlock.append($("<button>").append("Submit").click(submit));

  return $queryBlock;
}
202
+
203
/**
 * Return the keys of the first item whose values are not the same across all of `items`.
 *
 * Values are compared by their `JSON.stringify` output, so structurally equal arrays and
 * objects count as equal. Only keys present on `items[0]` are considered.
 *
 * @param {Object[]} items - The objects to compare.
 * @returns {string[]} Keys whose value differs in at least one item; [] for an empty list.
 */
function getChangingKeys(items) {
  if (items.length === 0) {
    return [];
  }
  const first = items[0];
  const changingKeys = [];
  for (const key of Object.keys(first)) {
    const baseline = JSON.stringify(first[key]);
    const varies = items.some((item) => JSON.stringify(item[key]) !== baseline);
    if (varies) {
      changingKeys.push(key);
    }
  }
  return changingKeys;
}
214
+
215
/**
 * Render one request as a <div>: a bracketed summary of selected fields followed by the prompt.
 *
 * The summary always lists `model` first, then every changing key other than `prompt` and
 * `model`. The full request, JSON-encoded, is set as the element's title (hover text).
 *
 * @param {string[]} changingKeys - Keys whose values vary across the requests being shown.
 * @param {Object} request - The request to render.
 * @returns {jQuery} The rendered element.
 */
function renderRequest(changingKeys, request) {
  const title = JSON.stringify(request);
  // Always include model, never prompt (since that's shown right after).
  const showKeys = ["model"].concat(
    changingKeys.filter((key) => key !== "prompt" && key !== "model"),
  );
  const summary =
    "[" + showKeys.map((key) => key + ":" + request[key]).join(", ") + "]";
  // The summary is built from request values, which come from user-entered settings.
  // Append it as a text node so it is never parsed as markup; only the output of
  // multilineHtml (defined outside this block) is handed to jQuery as a string.
  return $("<div>", { title })
    .append(document.createTextNode(summary + " "))
    .append(multilineHtml(request.prompt));
}
228
+
229
/**
 * Format a duration as a string rounded to one decimal place with an "s" suffix.
 *
 * @param {number} time - The duration to format.
 * @returns {string} For example "1.3s".
 */
function renderTime(time) {
  const tenths = Math.round(time * 10);
  return `${tenths / 10}s`;
}
232
+
233
/**
 * Group tokens so that each group corresponds to displayable text.
 *
 * Sometimes multiple tokens correspond to one character, for example:
 *   ["bytes:\xe2\x80", "bytes:\x99"] => ’
 * Consecutive "bytes:..." tokens are collected into one group and their byte strings are
 * concatenated and decoded together. Every other token forms a group of its own.
 *
 * If the collected bytes cannot be decoded (for example the run stops in the middle of a
 * multi-byte character), the group's text falls back to the raw concatenated byte string
 * instead of throwing.
 *
 * get_num_bytes() and convert_tokens_to_text() in src/helm/benchmark/basic_metrics.py are
 * adapted from this function.
 *
 * @param {{text: string}[]} tokens - Tokens in order.
 * @returns {{tokens: Object[], text: string}[]} The groups, in order.
 */
function constructTokenGroups(tokens) {
  const BYTES_PREFIX = "bytes:";
  const groups = [];
  let i = 0;
  while (i < tokens.length) {
    const group = { tokens: [] };
    if (tokens[i].text.startsWith(BYTES_PREFIX)) {
      // Aggregate consecutive tokens while they're "bytes:..."
      let bytestring = "";
      while (i < tokens.length && tokens[i].text.startsWith(BYTES_PREFIX)) {
        group.tokens.push(tokens[i]);
        // Extract part after : (e.g., \xe2\x80)
        bytestring += tokens[i].text.split(":")[1];
        i++;
      }
      try {
        // Convert to encoded URI (e.g., %e2%80%99) and decode
        group.text = decodeURIComponent(bytestring.replaceAll("\\x", "%"));
      } catch (err) {
        if (!(err instanceof URIError)) {
          throw err;
        }
        // Malformed or incomplete byte sequence: show the raw bytes rather than failing.
        group.text = bytestring;
      }
    } else {
      group.tokens.push(tokens[i]);
      group.text = tokens[i].text;
      i++;
    }
    groups.push(group);
  }
  return groups;
}
262
+
263
/**
 * Render a token list as a <div> containing one <span class="token"> per token group.
 *
 * Groups come from `constructTokenGroups`; each group's text is passed through
 * `multilineHtml` (defined outside this block) before being appended.
 *
 * @param {{text: string}[]} tokens - Tokens in order.
 * @returns {jQuery} The container element.
 */
function renderTokens(tokens) {
  const $result = $("<div>");
  constructTokenGroups(tokens).forEach((group) => {
    const $token = $("<span>", { class: "token" });
    $token.append(multilineHtml(group.text));
    $result.append($token);
  });
  return $result;
}
275
+
276
/**
 * Render the result of one request: either an error, or one row per completion followed by
 * the request time.
 *
 * Each completion row shows its rounded log probability, its finish reason when present, and
 * its tokens.
 *
 * @param {Object} requestResult - Response from /api/request.
 * @returns {jQuery} The rendered result, or the output of `renderError` when
 *   `requestResult.error` is set.
 */
function renderRequestResult(requestResult) {
  if (requestResult.error) {
    return renderError(requestResult.error);
  }

  const $result = $("<div>");
  for (const completion of requestResult.completions) {
    // Metadata: log probability, then the finish reason if there is one.
    const $logprob = $("<span>", { title: "Log probability" });
    $logprob.append(round(completion.logprob, 2));
    const $metadata = $("<span>", { class: "metadata" }).append($logprob);

    if (completion.finish_reason) {
      const reasonTitle =
        "Generation finished because of this reason: " +
        JSON.stringify(completion.finish_reason);
      const $reason = $("<span>", { title: reasonTitle });
      $reason.append(completion.finish_reason.reason);
      $metadata.append(" ").append($reason);
    }

    // Contents: the completion's tokens, with the raw logprob as hover text.
    const $contents = $("<span>", { title: `logprob: ${completion.logprob}` });
    $contents.append(renderTokens(completion.tokens));

    const $completion = $("<div>", { class: "completion" });
    $completion.append($metadata).append($contents);
    $result.append($completion);
  }

  $result.append($("<i>").append(renderTime(requestResult.request_time)));
  return $result;
}
309
+
310
/**
 * Render the account usage block (usage versus quota per model group and granularity).
 *
 * Returns null when nobody is logged in. Otherwise returns an initially empty block that is
 * filled in once /api/account responds; only usages that have a quota are listed, and
 * "no restrictions" is shown when none do.
 *
 * @returns {jQuery|null} The account block, or null when `auth` is not set.
 */
function renderAccount() {
  if (!auth) {
    return null;
  }

  const $accountBlock = $("<div>", { class: "block" });
  const args = { auth: JSON.stringify(auth) };
  // The response is destructured as an array; its first element is the account.
  $.getJSON("/api/account", args, ([account]) => {
    console.log("/api/account", account);
    const items = [];
    // Declare the loop variables: left undeclared they become implicit globals.
    for (const modelGroup in account.usages) {
      for (const granularity in account.usages[modelGroup]) {
        const usage = account.usages[modelGroup][granularity];
        // Only print out usage for model groups and granularities where there is a quota
        if (usage.quota) {
          const percent = Math.round((usage.used / usage.quota) * 100);
          items.push(
            `<b>${modelGroup}</b>: ${usage.period} (${usage.used} / ${usage.quota} = ${percent}%)`,
          );
        }
      }
    }
    if (items.length === 0) {
      items.push("no restrictions");
    }
    $accountBlock
      .empty()
      .append(
        helpIcon(
          "Specifies your usage/quota (321/10000) for each model group (e.g., gpt3) for the current period (e.g., 2022-1-2).",
          "help.html#quotas",
        ),
      )
      .append("Usage")
      .append(": ")
      .append(items.join(" | "));
  });
  return $accountBlock;
}
350
+
351
+ ////////////////////////////////////////////////////////////
352
+ // For index.html
353
+
354
/**
 * Build the main page for index.html: account usage, the query editor, and the area where
 * requests and their results appear.
 *
 * When a query result arrives, the requests area is cleared and either an error is shown or
 * each request is rendered with a "(waiting)" placeholder and sent to /api/request. The
 * placeholder is replaced by the result when it arrives, and the account block is re-rendered
 * whenever the result was not served from cache.
 *
 * @returns {jQuery} The element containing the three blocks.
 */
function renderQueryInterface() {
  const $accountBlock = $("<div>").append(renderAccount());

  // Where the requests and responses come in. Declared before the query callback that
  // fills it, so the callback never refers to a not-yet-initialized binding.
  const $requestsBlock = $("<div>", { class: "block" });

  // Allow editing the query
  const $queryBlock = renderQuery((queryResult) => {
    // Create requests
    console.log("/api/query", queryResult);
    $requestsBlock.empty();

    if (queryResult.error) {
      $requestsBlock.append(renderError(queryResult.error));
      return;
    }

    $requestsBlock.append(
      $("<h4>").append(`Requests (${queryResult.requests.length})`),
    );
    const changingKeys = getChangingKeys(queryResult.requests);
    queryResult.requests.forEach((request) => {
      const $request = $("<div>", { class: "request" }).append(
        renderRequest(changingKeys, request),
      );
      const $requestResult = $("<div>").append($("<i>").append("(waiting)"));
      const args = {
        auth: JSON.stringify(auth),
        request: JSON.stringify(request),
      };
      $.getJSON("/api/request", args, (requestResult) => {
        console.log("/api/request", request, requestResult);
        $requestResult.empty().append(renderRequestResult(requestResult));
        // Refresh the usage display unless the result came from cache.
        if (!requestResult.cached) {
          $accountBlock.empty().append(renderAccount());
        }
      });
      $request.append($requestResult);
      $requestsBlock.append($request);
    });
  });

  const $group = $("<div>");
  $group.append($accountBlock);
  $group.append($queryBlock);
  $group.append($requestsBlock);
  return $group;
}
407
+
408
+ ////////////////////////////////////////////////////////////
409
+ // For help.html
410
+
411
/**
 * Build the models table for help.html: a header row followed by one row per entry of
 * `generalInfo.all_models`, with columns group, name, description and tags.
 *
 * Group and name are wrapped in <tt>; tags are joined with spaces.
 *
 * @returns {jQuery} The table element.
 */
function renderModelsTable() {
  // Build a <tr> whose cells contain the given contents, in order.
  const makeRow = (cells) => {
    const $tr = $("<tr>");
    cells.forEach((content) => {
      $tr.append($("<td>").append(content));
    });
    return $tr;
  };

  const $table = $("<table>", { class: "table" });
  $table.append(makeRow(["group", "name", "description", "tags"]));

  for (const model of generalInfo.all_models) {
    $table.append(
      makeRow([
        $("<tt>").append(model.group),
        $("<tt>").append(model.name),
        model.description,
        model.tags.join(" "),
      ]),
    );
  }
  return $table;
}
430
+
431
+ ////////////////////////////////////////////////////////////
432
+ // Main
433
+
434
// General information fetched from /api/general_info; read by the render functions above.
let generalInfo;

$.getJSON("/api/general_info", (response) => {
  generalInfo = response;
  console.log("/api/general_info", generalInfo);
  if (generalInfo.error) {
    alert(generalInfo.error);
    return;
  }

  // Each page provides a different mount point: #main on index.html, #help-models on
  // help.html. Fill in whichever one exists on the current page.
  const mountPoints = [
    ["#main", renderQueryInterface],
    ["#help-models", renderModelsTable],
  ];
  for (const [selector, render] of mountPoints) {
    const $target = $(selector);
    if ($target.length > 0) {
      $target.empty().append(render());
    }
  }
});
456
+ });
Binary file
helm/proxy/test_retry.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from retrying import RetryError
2
2
 
3
3
  from helm.common.request import RequestResult
4
- from .retry import retry_request, get_retry_decorator, retry_if_request_failed
4
+ from helm.proxy.retry import retry_request, get_retry_decorator, retry_if_request_failed
5
5
 
6
6
 
7
7
  def test_retry_for_successful_request():
@@ -4,7 +4,7 @@ from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_
4
4
  from helm.common.request import Request, GeneratedOutput
5
5
  from helm.tokenizers.auto_tokenizer import AutoTokenizer
6
6
  from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
7
- from .token_counter import TokenCounter
7
+ from helm.proxy.token_counters.token_counter import TokenCounter
8
8
 
9
9
 
10
10
  class AutoTokenCounter(TokenCounter):
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  DecodeRequest,
11
11
  TokenizationToken,
12
12
  )
13
- from .caching_tokenizer import CachingTokenizer
13
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
14
14
 
15
15
  try:
16
16
  from aleph_alpha_client import Client as AlephAlphaPythonClient
@@ -1,6 +1,6 @@
1
1
  from abc import abstractmethod
2
2
  from dataclasses import asdict
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List
4
4
 
5
5
  from helm.common.cache import Cache, CacheConfig
6
6
  from helm.common.request import wrap_request_time
@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
11
11
  DecodeRequestResult,
12
12
  TokenizationToken,
13
13
  )
14
- from .tokenizer import Tokenizer
14
+ from helm.tokenizers.tokenizer import Tokenizer
15
15
 
16
16
 
17
17
  class CachingTokenizer(Tokenizer):
@@ -153,31 +153,3 @@ class CachingTokenizer(Tokenizer):
153
153
  )
154
154
  except Exception as error:
155
155
  raise ValueError(f"Failed to decode tokens with {self.__class__.__name__} tokenizer: {error}") from error
156
-
157
-
158
def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
    """
    Replace a tokenizer's space-marker character in `token` with a plain space.

    Certain tokenizers introduce special characters to represent spaces: "▁" for the
    tokenizer names listed below, and "Ġ" for names starting with "huggingface". For any
    other name, or when no name is given, `token` is returned unchanged.
    """
    if tokenizer_name is None:
        return token

    underscore_marker_tokenizers = (
        "TsinghuaKEG/ice",
        "bigscience/T0pp",
        "google/t5-11b",
        "google/flan-t5-xxl",
        "google/ul2",
        "Yandex/yalm",
        "ai21/j1",
        "together",
    )
    if tokenizer_name in underscore_marker_tokenizers:
        return token.replace("▁", " ")
    if tokenizer_name.startswith("huggingface"):
        return token.replace("Ġ", " ")
    return token
177
-
178
-
179
def cleanup_tokens(tokens: List[str], tokenizer_name: Optional[str] = None) -> List[str]:
    """
    Return a new list with `cleanup_str` applied to every token in `tokens`, in order.
    """
    cleaned: List[str] = []
    for token in tokens:
        cleaned.append(cleanup_str(token, tokenizer_name))
    return cleaned
@@ -11,7 +11,7 @@ from helm.common.tokenization_request import (
11
11
  TokenizationRequestResult,
12
12
  TokenizationToken,
13
13
  )
14
- from .tokenizer import Tokenizer
14
+ from helm.tokenizers.tokenizer import Tokenizer
15
15
 
16
16
  import requests
17
17
 
@@ -7,8 +7,8 @@ from helm.common.concurrency import ThreadSafeWrapper
7
7
  from transformers import AutoTokenizer, PreTrainedTokenizerBase
8
8
 
9
9
  from helm.common.hierarchical_logger import htrack_block, hlog
10
- from .caching_tokenizer import CachingTokenizer
11
- from .tokenizer import cleanup_tokens
10
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
11
+ from helm.tokenizers.tokenizer import cleanup_tokens
12
12
 
13
13
 
14
14
  WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
@@ -5,7 +5,7 @@ import torch
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.optional_dependencies import OptionalDependencyNotInstalled
8
- from .caching_tokenizer import CachingTokenizer
8
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
9
9
 
10
10
  try:
11
11
  from lit_gpt import Tokenizer as InternalTokenizer
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  TokenizationRequest,
11
11
  TokenizationRequestResult,
12
12
  )
13
- from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
13
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
14
14
 
15
15
 
16
16
  class TestAnthropicTokenizer:
@@ -21,7 +21,11 @@ class TestAnthropicTokenizer:
21
21
  def setup_method(self, method):
22
22
  cache_file = tempfile.NamedTemporaryFile(delete=False)
23
23
  self.cache_path: str = cache_file.name
24
- self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
24
+ self.tokenizer = HuggingFaceTokenizer(
25
+ SqliteCacheConfig(self.cache_path),
26
+ tokenizer_name="anthropic/claude",
27
+ pretrained_model_name_or_path="Xenova/claude-tokenizer",
28
+ )
25
29
 
26
30
  def teardown_method(self, method):
27
31
  os.remove(self.cache_path)
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  TokenizationRequest,
11
11
  TokenizationRequestResult,
12
12
  )
13
- from .huggingface_tokenizer import HuggingFaceTokenizer
13
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
14
14
 
15
15
 
16
16
  class TestHuggingFaceGPT2Tokenizer:
@@ -10,7 +10,7 @@ from helm.common.tokenization_request import (
10
10
  TokenizationRequest,
11
11
  TokenizationRequestResult,
12
12
  )
13
- from .yalm_tokenizer import YaLMTokenizer
13
+ from helm.tokenizers.yalm_tokenizer import YaLMTokenizer
14
14
 
15
15
 
16
16
  class TestYaLMTokenizer:
@@ -1,7 +1,7 @@
1
1
  from typing import Any, Dict
2
2
 
3
3
  from helm.common.optional_dependencies import handle_module_not_found_error
4
- from .caching_tokenizer import CachingTokenizer
4
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
5
5
 
6
6
  try:
7
7
  import tiktoken
@@ -41,7 +41,9 @@ def cleanup_str(token: str, tokenizer_name: Optional[str] = None) -> str:
41
41
  "together",
42
42
  ]:
43
43
  return token.replace("▁", " ")
44
- elif tokenizer_name is not None and tokenizer_name.startswith("huggingface"):
44
+ elif tokenizer_name is not None and (
45
+ tokenizer_name.startswith("huggingface") or tokenizer_name == "anthropic/claude"
46
+ ):
45
47
  return token.replace("Ġ", " ")
46
48
  return token
47
49
 
@@ -1,9 +1,9 @@
1
1
  from typing import Any, Dict
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
- from .caching_tokenizer import CachingTokenizer
5
- from .tokenizer import cleanup_tokens
6
- from .yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer as YaLMTokenizerInternal
4
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
5
+ from helm.tokenizers.tokenizer import cleanup_tokens
6
+ from helm.tokenizers.yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer as YaLMTokenizerInternal
7
7
 
8
8
 
9
9
  class YaLMTokenizer(CachingTokenizer):
@@ -1,7 +1,7 @@
1
1
  from typing import List
2
2
 
3
3
  from helm.common.general import singleton
4
- from .yalm_tokenizer import YaLMTokenizer
4
+ from helm.tokenizers.yalm_tokenizer_data.yalm_tokenizer import YaLMTokenizer
5
5
 
6
6
 
7
7
  class TestYaLMTokenizer: