crfm-helm: crfm_helm-0.5.6-py3-none-any.whl → crfm_helm-0.5.10-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the original registry diff page for more details.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
@@ -3,6 +3,7 @@ import os
3
3
  import random
4
4
  from typing import List, Dict, Tuple
5
5
 
6
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
6
7
  from helm.common.general import ensure_file_downloaded
7
8
  from helm.benchmark.scenarios.scenario import (
8
9
  Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
14
15
  DEFAULT_TEST_SIZE,
15
16
  PassageQuestionInput,
16
17
  Output,
18
+ ScenarioMetadata,
17
19
  )
18
20
 
19
21
  AMBIGUOUS_TAG = "ambiguous"
@@ -237,3 +239,16 @@ class BBQScenario(Scenario):
237
239
  instances.append(instance)
238
240
 
239
241
  return instances
242
+
243
+ def get_metadata(self) -> ScenarioMetadata:
244
+ return ScenarioMetadata(
245
+ name="bbq",
246
+ display_name="BBQ (Bias Benchmark for Question Answering)",
247
+ short_display_name="BBQ",
248
+ description="The Bias Benchmark for Question Answering (BBQ) for measuring social bias in "
249
+ "question answering in ambiguous and unambigous context [(Parrish et al., "
250
+ "2022)](https://aclanthology.org/2022.findings-acl.165/).",
251
+ taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
252
+ main_metric="bbq_accuracy",
253
+ main_split="test",
254
+ )
@@ -0,0 +1,473 @@
1
+ rules:
2
+ - category: Root
3
+ tags: [personal]
4
+ expansions:
5
+ # https://gridfiti.com/best-chatgpt-prompts/#personal
6
+ - text: Act as a chef. Write recipes for an ${Language} three-course meal I can cook for date night.
7
+ - text: Write a casual message in ${Language} to my Airbnb host saying I’m going to be a little late to check-in and that I will arrive at 4pm.
8
+ - text: Write a formal complaint email to United Airlines about my delayed bag from my flight on Tuesday January 17th from New York to Los Angeles.
9
+ - text: "Summarize this article into bullet points: ${Article}"
10
+ - text: Act as a European travel agent. Come up with a 14-day itinerary for a trip to Germany. The first suggested attraction should be “Take a tour of the Reichstag Building in Berlin.”
11
+ - text: Write a letter of resignation to my employer. The reason for my resignation is that I need a more flexible schedule due to family issues.
12
+ - text: What’s the best way to make new friends when moving to a new city?
13
+ - text: What’s the quickest way to get across Toronto during rush hour?
14
+ - text: "Translate the following text into ${Language}: ${Article}"
15
+ - text: List 5 of the best bars in ${City}.
16
+ - text: Act as a tailor. Pick an appropriate outfit for a ${Event}.
17
+ - text: Respond to this text message below from my mom ${Message}
18
+
19
+ - category: Language
20
+ expansions:
21
+ - text: Italian
22
+ - text: Greek
23
+ - text: Indian
24
+ - text: Chinese
25
+ - text: Thai
26
+
27
+ - category: Article
28
+ expansions:
29
+ - text: "A ship's wheel or boat's wheel is a device used aboard a water vessel to steer that vessel and control its course. Together with the rest of the steering mechanism, it forms part of the helm. It is connected to a mechanical, electric servo, or hydraulic system which alters the horizontal angle of the vessel's rudder relative to its hull. In some modern ships the wheel is replaced with a simple toggle that remotely controls an electro-mechanical or electro-hydraulic drive for the rudder, with a rudder position indicator presenting feedback to the helmsman."
30
+
31
+ - category: City
32
+ expansions:
33
+ - text: Phoenix
34
+ - text: London
35
+
36
+ - category: Message
37
+ expansions:
38
+ - text: I tried calling yesterday, but I didn't get a response
39
+ - text: I love you
40
+
41
+ - category: Event
42
+ expansions:
43
+ - text: bar mitzvah
44
+ - text: wedding
45
+ - text: funeral
46
+ - text: baby shower
47
+
48
+ - category: Root
49
+ tags: [funny]
50
+ expansions:
51
+ # https://gridfiti.com/best-chatgpt-prompts/#funny
52
+ - text: What’s the best prank to play on a friend?
53
+ - text: Send a pun-filled happy birthday message to my friend Alex.
54
+ - text: Make a joke about chickens.
55
+ - text: Write a parody song about the alphabet.
56
+ - text: What do you get when you cross a snowman and a vampire?
57
+ - text: Write a short story where a pencil is the main character.
58
+ - text: What would happen if dogs could talk?
59
+ - text: Write a fictional news headline about robots taking over the world.
60
+ - text: Create a silly dialogue between two inanimate objects.
61
+ - text: What would happen if the moon were made of cheese?
62
+
63
+ - category: Root
64
+ tags: [student]
65
+ expansions:
66
+ # https://gridfiti.com/best-chatgpt-prompts/#student
67
+ - text: Act as a college interviewer for a Business School. Help me come up with questions I should ask the interviewer at the end of the interview.
68
+ - text: Act as a tutor. I need help understanding how the quadratic formula works. Please describe it in easy-to-understand terms.
69
+ - text: How can I modify the Pomodoro technique to suit my method of study?
70
+ - text: "Explain possible meanings of this quote: ${Quote}"
71
+ - text: Can you devise practical ways to stay focused during long study sessions?
72
+ - text: Help me find a way to balance my studying and social life.
73
+ - text: Structure a 1,500-word essay on Max Planck’s quantum theory.
74
+ - text: Come up with 10 ways to improve memory and recall while studying for exams.
75
+ - text: List note-taking techniques for a chemistry lecture.
76
+ - text: Suggest 10 Chrome extensions for students designed to improve productivity while studying.
77
+
78
+ - category: Quote
79
+ expansions:
80
+ - text: "Happiness lies in the joy of achievement and the thrill of creative effort. -Franklin D. Roosevelt"
81
+ - text: "Invincibility lies in the defence; the possibility of victory in the attack. -Sun Tzu"
82
+
83
+ - category: Root
84
+ tags: [marketing]
85
+ expansions:
86
+ # https://gridfiti.com/best-chatgpt-prompts/#marketing
87
+ - text: Write a personalized blog post promoting my latest WordPress theme bundle.
88
+ - text: How do I increase my Twitter followers?
89
+ - text: Generate content ideas for my SaaS company.
90
+ - text: Produce 50 hashtags
91
+ - text: Create a TikTok campaign plan for launching an exciting new low carb mac and cheese, aimed at Gen Z and millennial consumers.
92
+ - text: Suggest inexpensive ways I can promote my plumbing business without using social media.
93
+ - text: Is investing in influencer marketers worth the cost?
94
+ - text: How can I grow our brand’s TikTok audience?
95
+ - text: How can I use YouTube to increase brand awareness?
96
+ - text: Write a product description for my latest set of landscape oil paintings of the Scottish Highlands.
97
+ - text: Generate high-ticket offerings for my online language course.
98
+ - text: How can I use TikTok to increase sales conversions?
99
+ - text: Write a minute-long script for an advertisement about new sneakers.
100
+
101
+ - category: Root
102
+ tags: [midjourney]
103
+ expansions:
104
+ # https://gridfiti.com/best-chatgpt-prompts/#midjourney
105
+ - text: "Write a good prompt for an image generation AI to make an image of this: ${ImageDescription}"
106
+ - text: Generate a detailed description of an AI-generated cityscape with a futuristic twist.
107
+ - text: Create an image description that describes a visually stunning setting that takes place in the year 3030.
108
+ - text: Design with words an abstract composition with a graphic, minimalist style.
109
+ - text: With distinct adjectives, create a visual with words that would encompass the feeling of being lost in life
110
+
111
+ - category: ImageDescription
112
+ expansions:
113
+ - text: a cute puppy
114
+ - text: a beautiful sunset
115
+
116
+ - category: Root
117
+ tags: [entrepreneur]
118
+ expansions:
119
+ # https://gridfiti.com/best-chatgpt-prompts/#entrepreneur
120
+ - text: Analyze the current state of ${Industry} and its trends, challenges, and opportunities, including relevant data and statistics. Provide a list of key players and a short and long-term industry forecast, and explain any potential impact of current events or future developments.
121
+ - text: Provide a step-by-step guide on creating a business plan, including key components, useful resources, and tips for success.
122
+ - text: Write a comprehensive and easy-to-understand explanation of different marketing strategies and their effectiveness for small businesses.
123
+ - text: Offer a comprehensive guide to small business financing options, including loans, grants, and equity financing.
124
+ - text: Provide a guide on managing finances for a small business, including budgeting, cash flow management, and tax considerations.
125
+ - text: Write an in-depth analysis of the current state of a specific industry and its potential for small business opportunities.
126
+ - text: Offer a detailed review of a specific software or tool for small business operations, such as accounting, project management, or CRM.
127
+ - text: Write a detailed explanation of the pros and cons of outsourcing vs in-house for small business operations.
128
+ - text: Offer an in-depth analysis of the current state of small business legislation and regulations and their impact on entrepreneurship.
129
+ - text: Provide a guide on networking and building partnerships as a small business owner.
130
+ - text: Present a list of valuable resources and organizations for small business support and growth.
131
+
132
+ - category: Industry
133
+ expansions:
134
+ - text: machine learning
135
+ - text: greentech
136
+
137
+ - category: Root
138
+ tags: [blogging]
139
+ expansions:
140
+ # https://gridfiti.com/best-chatgpt-prompts/#blogging
141
+ - text: Write a brief for a blog post about opening a Gumroad store.
142
+ - text: Generate 5 social media posts for my blog post on AppSumo.
143
+ - text: Pick five keywords for a blog post titled “10 ways to improve my photography skills.”
144
+ - text: Suggest engaging titles for a blog post about 1930s Art Deco architecture.
145
+ - text: "Generate user-friendly URLs for the domain ${DomainName} for these keywords: ${Keyword}."
146
+ - text: Create a content calendar with six blog titles, including the keyword ${Keyword}. Pick suitable publishing dates for each guide spread across May 2023.
147
+ - text: Write a creative outreach email for a guest post pitch for the keyword ${Keyword} for the domain ${DomainName}. Come up with 3 title ideas using the keyword.
148
+
149
+ - category: DomainName
150
+ expansions:
151
+ - text: alphabet-soup.com
152
+
153
+ - category: Keyword
154
+ expansions:
155
+ - text: allergies
156
+ - text: nutrition
157
+
158
+ - category: Root
159
+ tags: [creative]
160
+ expansions:
161
+ # https://gridfiti.com/best-chatgpt-prompts/#creative
162
+ - text: Write a scary short story about a man trapped in an abandoned house.
163
+ - text: Generate five synonyms for sublime.
164
+ - text: Write a backstory for a 55-year-old male character during the French Revolution.
165
+ - text: Write the first stanza of a poem about cabbages with an AABB rhyme scheme.
166
+ - text: Write hilarious fan fiction about the Twilight saga.
167
+ - text: How should I pace a science fiction novella about traveling to Saturn’s moon, Titan?
168
+ - text: Act as an 18th-century pirate. Describe what life was like on a pirate ship in Southeast Asia.
169
+ - text: Write the opening to a story from the point of view of a washing machine.
170
+ - text: How can I make a soliloquy engaging at the beginning of a play?
171
+ - text: Describe ways I can make a framed narrator relevant to a story.
172
+ - text: "Continue this dialogue between a store clerk and a police officer:\nA: Did you see anything suspicious yesterday afternoon?\nB: Yesterday was...Sunday, I don't remember anything out of the ordinary."
173
+
174
+ - category: Root
175
+ tags: [copywriting]
176
+ expansions:
177
+ # https://gridfiti.com/best-chatgpt-prompts/#copywriting
178
+ - text: Act as a copywriter. Write long-form copy for the Hard Rock Cafe in Macau promoting merchandise.
179
+ - text: Act as a copywriter. Write short-form copy for a billboard in Times Square promoting Wicked the Musical.
180
+ - text: How is short-form copywriting easier than long-form copywriting?
181
+ - text: How is copywriting different from SEO content writing?
182
+ - text: How does repetition improve short-form copywriting?
183
+ - text: What is the PAS formula? And give 3 examples of the PAS formula being used.
184
+ - text: Provide examples of successful copywriting campaigns that use repetition
185
+ - text: Give examples of newspaper headlines that grab the reader’s attention.
186
+ - text: How can I integrate copywriting into social media posts?
187
+ - text: List unusual copywriting techniques that I can use to create taglines.
188
+
189
+ - category: Root
190
+ tags: [health]
191
+ expansions:
192
+ # https://gridfiti.com/best-chatgpt-prompts/#health
193
+ - text: List the top 10 healthy foods to include in my diet.
194
+ - text: Develop a 30-day workout routine to help me lose 2 lbs a week.
195
+ - text: Act as a nutritionist. Help me devise 10 healthy meals that can be cooked in 30 minutes or less.
196
+ - text: Provide a guide on healthy nutrition for weight management and weight loss.
197
+ - text: Create a 1 month workout plan for me to exercise my shoulder muscles.
198
+ - text: Explain the benefits of daily exercise and provide a sample workout plan.
199
+ - text: Write a comprehensive guide on managing stress and maintaining mental wellness.
200
+ - text: Provide a list of common sleep disorders and tips for improving sleep quality.
201
+ - text: Explain the different types of therapy and their effectiveness in treating mental health issues.
202
+ - text: Offer a detailed explanation of the benefits and risks of alternative medicine practices, such as acupuncture and herbal remedies.
203
+ - text: Write an in-depth analysis of the current state of the healthcare system and its impact on the general population.
204
+ - text: Offer a list of recommended resources for quitting smoking and managing addiction.
205
+ - text: Explain the importance of regular medical check-ups and preventive care.
206
+
207
+ - category: Root
208
+ tags: [event]
209
+ expansions:
210
+ # https://gridfiti.com/best-chatgpt-prompts/#event
211
+ - text: Create a checklist for event planning, including important tasks and deadlines.
212
+ - text: What are some creative ways to add personal touches to the seating at a wedding dinner?
213
+ - text: Provide a list of top event venues in ${City}, along with their capacities.
214
+ - text: Write a guide on event budgeting, including tips for saving money and avoiding common overspending pitfalls.
215
+ - text: List the best catering companies in a specific area, along with menu options and prices.
216
+ - text: Offer a comprehensive explanation of event marketing, including target audience analysis and promotion strategies.
217
+ - text: Explain the importance of wedding photography, including styles, techniques, and essential shots to capture.
218
+ - text: Explain the different types of event equipment rental options, including audiovisual, lighting, and decor.
219
+ - text: Write a guide on event security, including necessary measures for crowd control and emergency response planning.
220
+ - text: Provide a list of event management software and tools, along with their key features and benefits.
221
+ - text: Offer an in-depth analysis of current event industry trends, including popular themes, formats, and technologies.
222
+ - text: Write a guide on event evaluation, including metrics for measuring success and feedback mechanisms for continuous improvement.
223
+
224
+ - category: Root
225
+ tags: [designer]
226
+ expansions:
227
+ # https://gridfiti.com/best-chatgpt-prompts/#designer
228
+ - text: Come up with 10 hex color codes for a color palette to evoke ${Emotion}
229
+ - text: Which online marketplace websites can I use to sell my designs?
230
+ - text: How can I create a minimalistic logo that conveys a strong brand image?
231
+ - text: What design elements should I consider when creating a packaging design for a luxury brand?
232
+ - text: How can I create an eye-catching poster design for an upcoming event?
233
+ - text: What color palette would be appropriate for a law firm’s website design?
234
+ - text: How can I design a user-friendly interface for a mobile application?
235
+ - text: What font and typography techniques should I use to create a professional-looking business card?
236
+ - text: How can I create an animated graphic that effectively communicates a complex idea?
237
+ - text: What design elements should I include in a brochure to promote a real estate development?
238
+
239
+ - category: Emotion
240
+ expansions:
241
+ - text: frustration
242
+ - text: joy
243
+ - text: despair
244
+
245
+ - category: Root
246
+ tags: [artist]
247
+ expansions:
248
+ # https://gridfiti.com/best-chatgpt-prompts/#artist
249
+ - text: Which factors determine the price of my artwork?
250
+ - text: How can I develop my own unique style as an artist?
251
+ - text: What techniques can I use to create a captivating digital illustration?
252
+ - text: How can I create a compelling concept for a series of illustrations?
253
+ - text: What tools and materials should I use for traditional watercolor painting?
254
+ - text: How can I create a realistic portrait in pencil or charcoal?
255
+ - text: What methods can I use to incorporate text into my illustrations?
256
+ - text: How can I create an engaging comic strip or graphic novel?
257
+ - text: What steps should I take to prepare a portfolio for job applications or exhibitions?
258
+ - text: How can I develop a successful freelance illustration business?
259
+ - text: What resources and communities are available to artists and illustrators for inspiration and professional development?
260
+
261
+ - category: Root
262
+ tags: [web]
263
+ expansions:
264
+ # https://gridfiti.com/best-chatgpt-prompts/#web
265
+ - text: Act as a software engineer. Come up with an architecture and code for developing a random winner picker website with JavaScript.
266
+ - text: "Please continue writing this code for JavaScript:\nfunction getTotalSum(numbers) {"
267
+ - text: Provide a UX design tip I can share on Instagram.
268
+ - text: "Help me find mistakes in the following code:\nfor (const i = 0; i <= array.length; i+) {\n sum += array[i];\n}"
269
+ - text: List ways I can use AI in software engineering.
270
+ - text: What are 5 of the best practices for software architecture design?
271
+ - text: What are the tips and tricks for writing efficient code?
272
+ - text: Suggest tools I can use to make writing code easier.
273
+ - text: How do I make an accessible Tailwind Footer?
274
+ - text: "Write a docstring for the following function:\ndef compute(items):\n\treturn sum(x**2 for x in items if x is not None)"
275
+ - text: I’m making a website for a small business that sells hand-crafted furniture. I need ideas on how to structure the website using WordPress.
276
+
277
+ - category: Root
278
+ tags: [project]
279
+ expansions:
280
+ # https://gridfiti.com/best-chatgpt-prompts/#project
281
+ - text: Create a workback schedule for a remodeling project, with a timeline of 6 months, with the deadline of August 1.
282
+ - text: Act like a project manager and create a high-level project plan for a new product launch.
283
+ - text: How can I effectively communicate my current web development project’s progress and status to stakeholders?
284
+ - text: How can I effectively prioritize tasks and allocate resources in a complex digital advertisement campaign project?
285
+ - text: What tools and methodologies can I use to manage my project’s risk?
286
+ - text: How can I motivate and engage a remote or virtual team?
287
+ - text: What strategies can I use to effectively manage project scope and budget?
288
+ - text: What approaches can I use to effectively manage and resolve conflicts within a project team?
289
+ - text: What processes should I put in place for continuous improvement and project optimization?
290
+ - text: How can I create a project schedule that accurately reflects task dependencies and resource constraints?
291
+ - text: What techniques can I use to successfully manage multiple projects simultaneously?
292
+
293
+ - category: Root
294
+ tags: [seo]
295
+ expansions:
296
+ # https://gridfiti.com/best-chatgpt-prompts/#seo
297
+ - text: Write a 100-character meta description for my blog post about classical piano.
298
+ - text: Come up with 5 long-tail keywords for a post about how to create a DIY slat wall.
299
+ - text: What are 5 ways I can improve SEO on my food blog?
300
+ - text: Write a casual backlink outreach email to Alice to tell them about why they should consider switching a link out on their "Best Bay Area hiking trails" post with my resource.
301
+
302
+ - category: Root
303
+ tags: [email]
304
+ expansions:
305
+ # https://gridfiti.com/best-chatgpt-prompts/#email
306
+ - text: Come up with 5 short email subject lines for our brand’s new launch of a lavender soap line, include an emoji at the beginning.
307
+ - text: Write a follow-up email for people who attended my precious metals webinar.
308
+ - text: Generate a subject line for a Black Friday sale email.
309
+ - text: Structure a weekly fitness newsletter.
310
+ - text: Write body copy for my vegan restaurant’s new menu launch.
311
+ - text: Create a personalized email greeting for a VIP customer.
312
+ - text: Create 5 ideas for an email campaign promoting eco-friendly products.
313
+ #- text: Analyze these below metrics to improve email open rates for a fashion brand <paste metrics>.
314
+ - text: Help me boost open rates with a compelling email subject line for a book club.
315
+ - text: Create 5 compelling CTAs to prompt donations for a charity fundraising marathon.
316
+ - text: How do I ensure my marketing emails look good on iOS and Android?
317
+ - text: How can I increase the click-through rate on my marketing emails?
318
+ - text: What is A/B testing and how can it improve email engagement?
319
+
320
+ - category: Root
321
+ tags: [social]
322
+ expansions:
323
+ # https://gridfiti.com/best-chatgpt-prompts/#social
324
+ - text: Generate 5 hashtags for a new Instagram post about our latest product launch.
325
+ - text: Create a captivating tweet to announce our new partnership.
326
+ - text: Come up with a list of 5 influencer outreach messages for a product collaboration.
327
+ - text: Generate a 2-minute video script for a Facebook ad campaign promoting our new service.
328
+ - text: Create a 1-paragraph blog post about the benefits of using our new app for social media management.
329
+ - text: Come up with 10 creative Instagram story ideas for a beauty brand.
330
+ - text: Generate a creative social media content calendar for the next month.
331
+ - text: Create a series of 5 Instagram posts to showcase our brand values.
332
+ - text: Write a catchy Instagram bio for a new food delivery service.
333
+ - text: Generate a series of 5 Twitter polls for market research on our target audience.
334
+ - text: Come up with a list of 10 engaging Facebook post ideas for a fitness brand.
335
+ - text: Create a LinkedIn post to announce a job opening in our company.
336
+ - text: Generate 5 creative ways to use Instagram Reels for a fashion brand.
337
+ - text: Write a persuasive tweet to promote a new book.
338
+ - text: Come up with a series of 5 Instagram posts to showcase customer success stories.
339
+ - text: Generate a list of 10 questions for a Q&A session on Instagram Live.
340
+ - text: Create a catchy TikTok hashtag challenge for a new product launch.
341
+ - text: Write a Twitter thread to explain the features of a new app.
342
+ - text: Come up with a list of 5 Pinterest boards to showcase our brand’s products.
343
+ - text: Generate a series of Facebook ads to promote an upcoming sale.
344
+
345
+ - category: Root
346
+ tags: [content]
347
+ expansions:
348
+ # https://gridfiti.com/best-chatgpt-prompts/#content
349
+ - text: What type of camera should I consider for daily vlogging?
350
+ - text: What are some creative ways to grow my Twitch audience?
351
+ - text: Write an outline for a YouTube video script for an iPhone 14 Pro Max review.
352
+ - text: What factors should I consider when quoting for a brand deal with a candle company, and what ballpark range should I charge? The scope is to post 3 videos on TikTok, and I have 100,000 followers.
353
+ - text: Come up with 5 catchy Instagram caption ideas for my latest vlog on hiking in Switzerland.
354
+ - text: Generate a script for a 60-second Instagram Reel for a Gen Z fashion brand.
355
+ - text: Come up with a list of 10 attention-grabbing headlines for a food influencer.
356
+ - text: Generate a persuasive email to a potential sponsor for a YouTube channel.
357
+ - text: Write a list of 5 topics to cover in a podcast episode for a personal finance show.
358
+ - text: Come up with a list of 10 Instagram post captions for a fitness influencer.
359
+ - text: Generate a script for a 2-minute Instagram story for a beauty brand.
360
+ - text: Write a list of 5 YouTube video ideas for a gaming channel.
361
+ - text: Come up with a list of 10 Twitter threads to start for a political commentator.
362
+ - text: Generate a list of 5 Pinterest boards to create for a home décor influencer.
363
+ - text: Come up with a list of 10 hashtags to use for a nature photographer’s Instagram posts.
364
+ - text: Generate a script for a 30-second commercial for a local business.
365
+ - text: Write a list of 5 topics to cover in a video for a cooking channel.
366
+ - text: Come up with a list of 10 Facebook post ideas for a pet store.
367
+ - text: Generate a list of 5 LinkedIn articles to write for a business consultant.
368
+ - text: Write a list of 5 TikTok video ideas for a dance influencer.
369
+ - text: Come up with a list of 10 Pinterest pins to create for a wedding planner.
370
+
371
+ - category: Root
372
+ tags: [sales]
373
+ expansions:
374
+ # https://gridfiti.com/best-chatgpt-prompts/#sales
375
+ - text: Write a cold email to a prospective customer to introduce them to my ${CompanyType} company and how it can benefit them.
376
+ - text: Create a personalized sales email for a potential customer for my ${CompanyType} company.
377
+ - text: Qualify this lead based on their behavior and interests.
378
+ - text: Segment our customers based on their buying behavior.
379
+ - text: Provide chat-based support for customer inquiries about our product.
380
+ - text: What complementary products would you recommend for this customer?
381
+ #- text: Generate a report on our sales performance for the past quarter <insert numbers below>
382
+ - text: What are some creative ways to generate leads for my ${CompanyType} company?
383
+ #- text: What product customization would you recommend for this customer? <include customer details below>
384
+ - text: Provide after-sales support and upselling opportunities for my ${CompanyType} product.
385
+ - text: What cross-selling opportunities would you recommend for my ${CompanyType} business?
386
+
387
+ - category: CompanyType
388
+ expansions:
389
+ - text: llama walking
390
+ - text: dancing robot
391
+
392
+ - category: Root
393
+ tags: [real]
394
+ expansions:
395
+ # https://gridfiti.com/best-chatgpt-prompts/#real
396
+ # real-estate agent
397
+ - text: Generate a list of 10 prospective client follow-up messages.
398
+ - text: Write a compelling property listing for a spacious 3-bedroom, 2-bathroom loft in SoHo, Manhattan.
399
+ - text: Write a persuasive email to a potential home seller.
400
+ - text: Create a 2-minute virtual tour script for a property listing.
401
+ - text: Create a list of 5 local hotspots to mention in a neighborhood guide.
402
+ - text: Write a 1-page property brochure for a new listing.
403
+ - text: Write a captivating property description for an online listing.
404
+ - text: Come up with a series of 5 social media posts to showcase your listings.
405
+ - text: Generate a list of 10 home-buying tips for first-time buyers.
406
+ - text: Write a persuasive letter to a property owner about listing their property with you.
407
+ - text: Come up with a list of 5 home staging tips for sellers.
408
+ - text: Generate a list of 10 potential clients from your network.
409
+ - text: Write a persuasive message to a client who is relocating to a new city.
410
+ - text: Come up with a series of 5 open house ideas.
411
+ - text: Generate a list of 10 potential leads from expired listings.
412
+ - text: Write a follow-up email to a client who recently viewed a property.
413
+ - text: Come up with a list of 5 reasons to choose your real estate company.
414
+ - text: Generate a persuasive message to a client considering renting instead of buying.
415
+ - text: Write a series of 5 Facebook ads to promote a new housing development.
416
+ - text: Come up with a list of 10 local real estate market trends to discuss with clients.
417
+ - text: Generate a list of 5 home-buying pitfalls to warn clients about.
418
+ # buyers and sellers
419
+ - text: What are the benefits of working with a real estate agent when buying or selling a property?
420
+ - text: How do you determine the market value of a property?
421
+ - text: What is the home buying process like and how can I prepare for it?
422
+ - text: What are some common mistakes that buyers make when purchasing a home?
423
+ - text: How can I stage my home to appeal to potential buyers?
424
+ - text: What is the current real estate market trend in ${City}?
425
+ - text: How can I negotiate the best price for my home?
426
+ - text: What documents do I need to have in order before buying or selling a property?
427
+ - text: What are the latest technology and marketing tools used to promote properties?
428
+ - text: Explain the different financing options available to home buyers
429
+
430
+ - category: Root
431
+ tags: [resume]
432
+ expansions:
433
+ # https://gridfiti.com/best-chatgpt-prompts/#resume
434
+ - text: Write a cover letter for a software engineer position highlighting my technical skills.
435
+ - text: Generate a personalized objective statement for a marketing resume.
436
+ - text: Come up with a list of 5 relevant achievements to include in a financial analyst cover letter.
437
+ - text: Generate a tailored 2-minute pitch for a sales job interview.
438
+ - text: Write a persuasive email to a potential employer explaining my background as a nurse.
439
+ - text: Come up with a list of 10 unique qualities to include in a teacher’s resume.
440
+ - text: Generate a 1-page summary of my experiences and accomplishments as a graphic designer.
441
+ - text: Write a cover letter addressing the specific qualifications listed for a project manager position.
442
+ - text: Come up with a list of 5 ways to tailor my resume for a customer service job.
443
+ - text: Generate a list of 10 keywords to include in a human resources resume and cover letter.
444
+ - text: Write a persuasive letter to a hiring manager explaining a gap in my work history as a lawyer.
445
+ - text: Come up with a list of 5 quantifiable results to highlight in a business analyst resume.
446
+ - text: Generate a list of 10 relevant skills and experiences for a web developer job application.
447
+ - text: Write a personalized thank you note to a potential employer after a doctor job interview.
448
+ - text: Come up with a list of 5 personal traits that make you a strong fit for a social worker role.
449
+ - text: Generate a 2-minute response to common interview questions for a data scientist position.
450
+ - text: Write a persuasive email to a potential employer negotiating a higher salary for a software developer role.
451
+ - text: Come up with a list of 10 professional references for an administrative assistant job application.
452
+ - text: Generate a list of 5 ways to make my resume stand out from other applicants for a journalist position.
453
+ - text: Write a persuasive message to a potential employer explaining my relocation for a chef role.
454
+
455
+ - category: Root
456
+ tags: [product]
457
+ expansions:
458
+ # https://gridfiti.com/best-chatgpt-prompts/#product
459
+ - text: Outline 5 potential features to enhance a food delivery app.
460
+ - text: Compile a market analysis report for a cutting-edge smartwatch.
461
+ - text: Identify 10 potential partnership opportunities for a ride-sharing company.
462
+ - text: Create a user flow diagram for a mobile app connecting users to local volunteer opportunities.
463
+ - text: Propose 5 solutions to improve the user experience on an e-commerce website.
464
+ - text: Prepare a competitor analysis report for a revolutionary virtual reality headset.
465
+ - text: Devise 10 possible integrations for a smart home automation system.
466
+ - text: Draft a product requirements document for a new and improved video conferencing tool.
467
+ - text: Suggest 5 ways to streamline the checkout process on an online store.
468
+ - text: Develop user personas for a new tablet designed to educate kids.
469
+ - text: Uncover 10 potential upsell opportunities for a successful meal kit subscription service.
470
+ - text: Build a product roadmap for a state-of-the-art fitness app.
471
+ - text: Propose 5 ways to simplify the onboarding process for a project management tool.
472
+ - text: Map out a customer journey for a novel pet care delivery service.
473
+ - text: Explore 10 potential collaborations for a green electric scooter rental company.
@@ -4,6 +4,7 @@ from typing import Dict, List
4
4
 
5
5
  from filelock import FileLock
6
6
 
7
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
7
8
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
8
9
  from helm.common.hierarchical_logger import hlog
9
10
  from helm.benchmark.scenarios.bird_sql_scenario_helper import ( # type: ignore
@@ -18,6 +19,7 @@ from helm.benchmark.scenarios.scenario import (
18
19
  VALID_SPLIT,
19
20
  Input,
20
21
  Output,
22
+ ScenarioMetadata,
21
23
  )
22
24
 
23
25
 
@@ -92,3 +94,19 @@ INSERT_YOUR_SQL_QUERY_HERE
92
94
  )
93
95
  instances.append(instance)
94
96
  return instances
97
+
98
def get_metadata(self) -> ScenarioMetadata:
    """Return the static metadata record for the BIRD-SQL (Dev) scenario.

    All fields are fixed constants; nothing is read from the instance.
    """
    taxonomy = TaxonomyInfo(
        task="text-to-SQL",
        what="databases from various domains",
        when="?",
        who="expert data scientists",
        language="English",
    )
    metadata = ScenarioMetadata(
        name="bird_sql",
        display_name="BIRD-SQL (Dev)",
        description="BIRD-SQL (Dev)",
        taxonomy=taxonomy,
        main_metric="execution_accuracy",
        main_split="valid",
    )
    return metadata
@@ -0,0 +1,70 @@
1
+ from typing import Any, List
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ TEST_SPLIT,
9
+ CORRECT_TAG,
10
+ Input,
11
+ Output,
12
+ )
13
+
14
+
15
class BLUEXScenario(Scenario):
    """
    The BLUEX dataset is a benchmark used for evaluating natural language processing models in Brazilian Portuguese.
    It consists of multiple-choice questions taken from official entrance exams of Unicamp (Convest) and USP (Fuvest),
    covering various high school subjects. The questions include both textual prompts and visual elements. This dataset
    was developed to assess the performance of models on tasks involving comprehension and reasoning, with a specific
    focus on texts and exams originally written in Portuguese.
    """

    name = "bluex"
    description = "MQA benchmark with questions from Brazilian entrance exams"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one test instance per usable question of the BLUEX dataset.

        The dataset is fetched with `datasets.load_dataset` at a pinned revision and
        cached under `<output_path>/data`. A question is skipped when it has
        associated images or when its answer label does not match any parsed
        alternative.

        Args:
            output_path: Directory under which the dataset cache directory is created.

        Returns:
            The instances, all in the test split. Each carries one reference per
            parsed alternative, and only the alternative whose label equals the
            answer label is tagged as correct.
        """
        cache_dir = str(Path(output_path) / "data")
        dataset: Any = load_dataset(
            "portuguese-benchmark-datasets/BLUEX",
            revision="d99cf6d05b50db7c42a605e5e2924cbd46f076c7",
            cache_dir=cache_dir,
        )

        instances: List[Instance] = []
        for example in dataset["questions"]:
            # Questions that rely on images are skipped: only the text is used here.
            if example["has_associated_images"]:
                continue
            question = example["question"]

            # Each alternative is split at its first ")": the left part is the label
            # (stripped and uppercased), the rest is the answer text. Alternatives
            # without a ")" are dropped.
            answers_dict = {}
            for alt in example["alternatives"]:
                if ")" in alt:
                    label, text = alt.split(")", 1)
                    answers_dict[label.strip().upper()] = text.strip()

            correct_label = example["answer"]
            if correct_label not in answers_dict:
                continue

            # Tag by label rather than by text, so that two alternatives that happen
            # to share the same text are not both marked as correct.
            references = [
                Reference(Output(text=text), tags=[CORRECT_TAG] if label == correct_label else [])
                for label, text in answers_dict.items()
            ]
            instances.append(
                Instance(
                    input=Input(text=question),
                    split=TEST_SPLIT,
                    references=references,
                )
            )
        return instances
@@ -3,6 +3,7 @@ import os
3
3
  import random
4
4
  from typing import List, Dict, Tuple
5
5
 
6
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
6
7
  from helm.common.general import ensure_file_downloaded
7
8
  from helm.benchmark.scenarios.scenario import (
8
9
  Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
11
12
  TEST_SPLIT,
12
13
  DEFAULT_TEST_SIZE,
13
14
  Input,
15
+ ScenarioMetadata,
14
16
  )
15
17
 
16
18
  TOXIC_TAG = "toxic"
@@ -118,3 +120,16 @@ class BOLDScenario(Scenario):
118
120
  instances.append(Instance(Input(text=f"{prompt_text} "), split=curr_split, references=[]))
119
121
 
120
122
  return instances
123
+
124
def get_metadata(self) -> ScenarioMetadata:
    """Return the static metadata record for the BOLD scenario.

    All fields are fixed constants; nothing is read from the instance.
    """
    summary = (
        "The Bias in Open-Ended Language Generation Dataset (BOLD) for measuring biases "
        "and toxicity in open-ended language generation [(Dhamala et al., "
        "2021)](https://dl.acm.org/doi/10.1145/3442188.3445924)."
    )
    taxonomy = TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic")
    return ScenarioMetadata(
        name="bold",
        display_name="BOLD (Bias in Open-Ended Language Generation Dataset)",
        short_display_name="BOLD",
        description=summary,
        taxonomy=taxonomy,
        main_metric="unknown",
        main_split="test",
    )