crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the package's registry page for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -1,16 +1,70 @@
1
1
  import json
2
- from typing import Dict
2
+ import re
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
3
5
 
6
+ from helm.benchmark.adaptation.request_state import RequestState
7
+ from helm.benchmark.annotation.annotator import Annotator
4
8
  from helm.clients.auto_client import AutoClient
9
+ from helm.common.hierarchical_logger import hlog
5
10
  from helm.common.request import Request
6
11
 
7
12
 
13
+ class AnnotatorResponseParseFailure(Exception):
14
+ def __init__(self, response_text: str, **kwargs):
15
+ self.response_text = response_text
16
+ super().__init__(kwargs)
17
+
18
+
19
+ @dataclass
20
+ class AnnotatorModelInfo:
21
+ model_name: str
22
+ model_deployment: str
23
+
24
+
25
+ def score_with_reasoning_with_gpt_and_llama(
26
+ auto_client: AutoClient,
27
+ annotator_prompt: str,
28
+ ) -> Dict[str, Optional[Union[str, float]]]:
29
+ """EXPERIMENTAL: DO NOT USE IN PRODUCTION
30
+
31
+ Score using GPT-4o and Llama 3.1 for safety scenarios in HELM Safety."""
32
+ # TODO: Make this configurable
33
+ SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
34
+ "gpt": AnnotatorModelInfo(model_name="openai/gpt-4o-2024-05-13", model_deployment="openai/gpt-4o-2024-05-13"),
35
+ "llama": AnnotatorModelInfo(
36
+ model_name="meta/llama-3.1-405b-instruct-turbo", model_deployment="together/llama-3.1-405b-instruct-turbo"
37
+ ),
38
+ }
39
+ result: Dict[str, Optional[Union[str, float]]] = {"prompt_text": annotator_prompt}
40
+ for short_name, model_info in SHORT_NAME_TO_MODEL_INFO.items():
41
+ try:
42
+ score_and_reasoning = score_with_reasoning(
43
+ auto_client,
44
+ annotator_prompt,
45
+ annotator_model=model_info.model_name,
46
+ annotator_model_deployment=model_info.model_deployment,
47
+ )
48
+ result[f"{short_name}_score"] = score_and_reasoning["score"]
49
+ result[f"{short_name}_reasoning"] = score_and_reasoning["reasoning"]
50
+ except AnnotatorResponseParseFailure as e:
51
+ result[f"{short_name}_score"] = None
52
+ result[f"{short_name}_reasoning"] = None
53
+ result[f"{short_name}_response_text"] = e.response_text
54
+ return result
55
+
56
+
57
+ class ScoreAndReasoning(TypedDict):
58
+ score: float
59
+ reasoning: str
60
+
61
+
8
62
  def score_with_reasoning(
9
63
  auto_client: AutoClient,
10
64
  annotator_prompt: str,
11
65
  annotator_model: str,
12
66
  annotator_model_deployment: str,
13
- ) -> Dict:
67
+ ) -> ScoreAndReasoning:
14
68
  annotator_request = Request(
15
69
  model=annotator_model,
16
70
  model_deployment=annotator_model_deployment,
@@ -23,23 +77,232 @@ def score_with_reasoning(
23
77
  raise Exception(f"Annotation request failed: {annotator_response.error}")
24
78
  assert len(annotator_response.completions) == 1
25
79
  annotator_response_text = annotator_response.completions[0].text
26
- json_start_index = annotator_response_text.find("{")
27
- json_end_index = annotator_response_text.rfind("}")
28
- if json_start_index < 0 or json_end_index < 0:
29
- raise Exception(f"Malformed annotator response: {annotator_response_text}")
30
- annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1]
31
- try:
32
- parsed_response = json.loads(annotator_response_json)
33
- except Exception as e:
34
- raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
35
-
36
- if not parsed_response:
37
- raise Exception(f"Malformed annotator response: {annotator_response_text}")
38
80
 
81
+ # fuzzy match regex check, allows for different casing, or forgetting / in end tag
82
+ reasoning_match = re.search(
83
+ r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE
84
+ )
85
+ score_match = re.search(r"<\s*score\s*>(.*?)<\/?\s*score\s*>", annotator_response_text, re.DOTALL | re.IGNORECASE)
86
+ if not reasoning_match or not score_match:
87
+ raise AnnotatorResponseParseFailure(
88
+ message=f"Could not parse markup in raw response: '{annotator_response_text}'",
89
+ response_text=annotator_response_text,
90
+ )
91
+ reasoning = reasoning_match.group(1).strip()
39
92
  try:
40
- score = float(parsed_response.get("score"))
41
- reasoning = parsed_response.get("reasoning").strip()
42
- except ValueError as e:
43
- raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
93
+ score = float(score_match.group(1).strip())
94
+ except ValueError:
95
+ raise AnnotatorResponseParseFailure(
96
+ message=f"Could not parse score as float from raw request: '{annotator_response_text}'",
97
+ response_text=annotator_response_text,
98
+ )
44
99
 
45
100
  return {"reasoning": reasoning, "score": score}
101
+
102
+
103
+ class LLMAsJuryAnnotator(Annotator):
104
+ """
105
+ A flexible LLM-based annotator that can be configured for different annotation scenarios.
106
+
107
+ This annotator supports:
108
+ - Custom prompt templates
109
+ - Multiple evaluation models
110
+ - Configurable evaluation criteria
111
+ - Robust error handling
112
+ """
113
+
114
+ def __init__(
115
+ self,
116
+ auto_client: AutoClient,
117
+ prompt_template: str,
118
+ annotation_criteria: Dict[str, Set[str]],
119
+ annotator_models: Dict[str, AnnotatorModelInfo],
120
+ preprocessor: Optional[Callable[[str], str]] = None,
121
+ ):
122
+ """
123
+ Initialize the LLMAsJuryAnnotator.
124
+
125
+ :param auto_client: Client for making API requests
126
+ :param prompt_template: Template for generating prompts
127
+ :param annotation_criteria: Dictionary defining expected annotation structure
128
+ :param annotator_models: Dictionary of models to use for annotation
129
+ :param preprocessor: Optional function to preprocess model responses
130
+ """
131
+ self._auto_client = auto_client
132
+ self._prompt_template = prompt_template
133
+ self._annotation_criteria = annotation_criteria
134
+ self._annotator_models = annotator_models
135
+ self._preprocessor = preprocessor or self._sanitize_model_response
136
+
137
+ def _sanitize_model_response(self, model_response: str) -> str:
138
+ """
139
+ Sanitize the model response to extract JSON.
140
+
141
+ :param model_response: Raw model response
142
+ :return: Extracted JSON string
143
+ """
144
+ json_match = re.search(r"\{.*\}", model_response, re.DOTALL)
145
+ return json_match.group(0) if json_match else model_response
146
+
147
+ def _interpolate_prompt(
148
+ self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
149
+ ) -> str:
150
+ """
151
+ Interpolate prompt template with request state information.
152
+
153
+ :param request_state: The current request state
154
+ :param custom_replacements: Optional dictionary of additional replacements
155
+ :return: Interpolated prompt
156
+ """
157
+ base_replacements = {
158
+ "{{QUESTION}}": request_state.instance.input.text,
159
+ "{{RESPONSE}}": (
160
+ request_state.result.completions[0].text
161
+ if request_state.result and request_state.result.completions
162
+ else ""
163
+ ),
164
+ "{{GOLD_RESPONSE}}": request_state.instance.references[0].output.text,
165
+ }
166
+
167
+ # Allow custom replacements to override base replacements
168
+ if custom_replacements:
169
+ base_replacements.update(custom_replacements)
170
+
171
+ prompt = self._prompt_template
172
+ for key, value in base_replacements.items():
173
+ prompt = prompt.replace(key, str(value))
174
+
175
+ return prompt
176
+
177
+ def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
178
+ """
179
+ Validate the annotation meets expected criteria.
180
+
181
+ :param annotator_criteria: Annotation dictionary to validate
182
+ :param annotator_name: Name of the annotator model
183
+ :return: Whether the annotation is valid
184
+ """
185
+ for key, value in self._annotation_criteria.items():
186
+ if key not in annotator_criteria:
187
+ hlog(
188
+ f"WARNING: Annotator did not find the expected key "
189
+ f"'{key}' in the response from {annotator_name}."
190
+ )
191
+ return False
192
+
193
+ for subkey in value:
194
+ if subkey not in annotator_criteria[key]:
195
+ hlog(
196
+ f"WARNING: Annotator did not find the expected subkey "
197
+ f"'{subkey}' in the response from {annotator_name}."
198
+ )
199
+ return False
200
+ return True
201
+
202
+ def annotate(self, request_state: RequestState) -> Dict[str, Any]:
203
+ """
204
+ Annotate the request state using multiple LLM models.
205
+
206
+ :param request_state: The request state to annotate
207
+ :return: Dictionary of annotations from different models
208
+ """
209
+ assert request_state.result
210
+ assert len(request_state.result.completions) == 1
211
+
212
+ # Check for empty model output
213
+ model_output_text = request_state.result.completions[0].text
214
+ if not model_output_text.strip():
215
+ hlog("WARNING: Annotator skipped sending requests because the model response was empty")
216
+ return {
217
+ "prompt_text": None,
218
+ "empty_output_equivalence_judgement": False,
219
+ }
220
+
221
+ # Prepare prompt
222
+ annotator_prompt = self._interpolate_prompt(request_state)
223
+ annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = {"prompt_text": annotator_prompt}
224
+
225
+ # Track failed annotations for each model
226
+ failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models}
227
+
228
+ # Annotate using multiple models
229
+ for annotator_name, annotator_model_info in self._annotator_models.items():
230
+ try:
231
+ annotator_criteria = self._annotate_with_model(annotator_prompt, annotator_model_info, annotator_name)
232
+
233
+ if annotator_criteria is not None:
234
+ annotations[annotator_name] = annotator_criteria
235
+ else:
236
+ failed_counts[annotator_name] += 1
237
+
238
+ except Exception as e:
239
+ hlog(f"ERROR annotating with {annotator_name}: {e}")
240
+ failed_counts[annotator_name] += 1
241
+
242
+ hlog(f"Failed model annotations: {failed_counts}")
243
+ return annotations
244
+
245
+ def _annotate_with_model(
246
+ self, prompt: str, model_info: AnnotatorModelInfo, annotator_name: str
247
+ ) -> Optional[Dict[str, Any]]:
248
+ """
249
+ Annotate using a specific model with enhanced JSON parsing.
250
+
251
+ :param prompt: Interpolated prompt
252
+ :param model_info: Model information
253
+ :param annotator_name: Name of the annotator
254
+ :return: Annotation criteria or None if failed
255
+ """
256
+ annotator_request = Request(
257
+ model=model_info.model_name,
258
+ model_deployment=model_info.model_deployment,
259
+ prompt=prompt,
260
+ temperature=0.0,
261
+ max_tokens=4096,
262
+ )
263
+
264
+ annotator_response = self._auto_client.make_request(annotator_request)
265
+
266
+ if not annotator_response.success:
267
+ hlog(f"WARNING: Got an error response from {model_info.model_name}: " f"{annotator_response.error}")
268
+ return None
269
+
270
+ try:
271
+ annotator_output = annotator_response.completions[0].text
272
+ annotator_output = self._preprocessor(annotator_output)
273
+
274
+ try:
275
+ annotator_criteria = json.loads(annotator_output)
276
+ except json.JSONDecodeError as e:
277
+ if e.msg == "Expecting ',' delimiter":
278
+ # Attempt to fix incomplete JSON by adding a closing brace
279
+ annotator_output = annotator_output + "}"
280
+ try:
281
+ annotator_criteria = json.loads(annotator_output)
282
+ except Exception as ex:
283
+ hlog(
284
+ f"WARNING: Error parsing response from {model_info.model_name} "
285
+ f"after adding closing brace: {ex}. "
286
+ f"Model output: {annotator_output}"
287
+ )
288
+ return None
289
+ else:
290
+ # For other JSON decoding errors
291
+ hlog(
292
+ f"WARNING: JSON decoding error from {model_info.model_name}: {e}. "
293
+ f"Model output: {annotator_output}"
294
+ )
295
+ return None
296
+
297
+ # Validate annotation structure
298
+ if not self._validate_annotation(annotator_criteria, annotator_name):
299
+ return None
300
+
301
+ return annotator_criteria
302
+
303
+ except Exception as e:
304
+ hlog(
305
+ f"WARNING: Unexpected error processing response from {model_info.model_name}: {e}. "
306
+ f"Model output: {annotator_output}"
307
+ )
308
+ return None
@@ -0,0 +1,98 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
4
+ from helm.clients.auto_client import AutoClient
5
+
6
+
7
+ PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
8
+ generated response of a clinical scenario.
9
+ Your goal is to assess how well the response addresses the patient case, follows clinical best practices,
10
+ and compares to the gold response in terms of accuracy, completeness, and clarity.
11
+
12
+ The user's request will be provided in these tags:
13
+ <user_request>
14
+ {{QUESTION}}
15
+ </user_request>
16
+
17
+ The response will be provided in these tags:
18
+ <response>
19
+ {{RESPONSE}}
20
+ </response>
21
+
22
+ Some potential correct responses will be provided in these tags:
23
+ <gold_response>
24
+ {{GOLD_RESPONSE}}
25
+ </gold_response>
26
+
27
+ Carefully analyze the <response>.
28
+ For each of the following categories, rate the Response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
29
+ and provide a short justification for your score.
30
+
31
+ Evaluation Criteria:
32
+ Accuracy (1-5)
33
+ - Does the response provide correct medical advice based on clinical guidelines and the patient case?
34
+
35
+ Completeness (1-5)
36
+ - Does the response cover all important aspects of care for this clinical scenario?
37
+
38
+ Clarity (1-5)
39
+ - Is the response written clearly and organized in a way that clinicians can easily follow?
40
+
41
+ Output Format:
42
+ Output the evaluation as a single valid JSON object matching the following structure:
43
+ {
44
+ "accuracy": {
45
+ "score": 0,
46
+ "explanation": "Explain why this score was given."
47
+ },
48
+ "completeness": {
49
+ "score": 0,
50
+ "explanation": "Explain why this score was given."
51
+ },
52
+ "clarity": {
53
+ "score": 0,
54
+ "explanation": "Explain why this score was given."
55
+ }
56
+ }
57
+
58
+ Ensure the output is valid JSON:
59
+ - Use **double quotes** (") for all keys and string values.
60
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
61
+ maintain valid JSON formatting.
62
+ - Do not include any additional information in the output.
63
+ """
64
+
65
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
66
+ "accuracy": {"score", "explanation"},
67
+ "completeness": {"score", "explanation"},
68
+ "clarity": {"score", "explanation"},
69
+ }
70
+
71
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
72
+ "gpt": AnnotatorModelInfo(
73
+ model_name="openai/gpt-4o-2024-05-13",
74
+ model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
75
+ ),
76
+ "llama": AnnotatorModelInfo(
77
+ model_name="meta/llama-3.3-70b-instruct",
78
+ model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
79
+ ),
80
+ "claude": AnnotatorModelInfo(
81
+ model_name="anthropic/claude-3-7-sonnet-20250219",
82
+ model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
83
+ ),
84
+ }
85
+
86
+
87
+ class MTSamplesProceduresAnnotator(LLMAsJuryAnnotator):
88
+ """The MTSamplesProcedures autograder."""
89
+
90
+ name = "mtsamples_procedures"
91
+
92
+ def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
93
+ super().__init__(
94
+ auto_client=auto_client,
95
+ prompt_template=PROMPT_TEMPLATE,
96
+ annotation_criteria=ANNOTATION_CRITERIA,
97
+ annotator_models=ANNOTATOR_MODELS,
98
+ )
@@ -0,0 +1,101 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
4
+ from helm.clients.auto_client import AutoClient
5
+
6
+
7
+ PROMPT_TEMPLATE = """You are a medical expert responsible for evaluating a proposed treatment plan
8
+ based on provided patient information.
9
+
10
+ Your goal is to assess whether the treatment plan uses the patient's medical history,
11
+ medications, symptoms, and other relevant details appropriately, follows clinical best practices,
12
+ and aligns with the gold standard response in accuracy, completeness, and clarity.
13
+
14
+ The patient's information will be provided in these tags:
15
+ <patient_information>
16
+ {{QUESTION}}
17
+ </patient_information>
18
+
19
+ The proposed treatment plan will be provided in these tags:
20
+ <response>
21
+ {{RESPONSE}}
22
+ </response>
23
+
24
+ The gold standard treatment plan will be provided in these tags:
25
+ <gold_response>
26
+ {{GOLD_RESPONSE}}
27
+ </gold_response>
28
+
29
+ Carefully review the <response> based on the <patient_information> and compare it to the <gold_response> when needed.
30
+
31
+ For each category below, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent) and
32
+ provide a brief explanation for your score:
33
+
34
+ Evaluation Criteria:
35
+ Accuracy (1-5)
36
+ - Does the treatment plan provide correct medical advice based on clinical guidelines?
37
+
38
+ Completeness (1-5)
39
+ Does the treatment plan include all important medical details?
40
+
41
+ Clarity (1-5)
42
+ - Is the treatment plan written clearly so clinicians can easily understand it?
43
+
44
+ Output Format:
45
+ Generate a valid JSON object with the following structure:
46
+ {
47
+ "accuracy": {
48
+ "score": 0,
49
+ "explanation": "Explain why this score was given."
50
+ },
51
+ "completeness": {
52
+ "score": 0,
53
+ "explanation": "Explain why this score was given."
54
+ },
55
+ "clarity": {
56
+ "score": 0,
57
+ "explanation": "Explain why this score was given."
58
+ }
59
+ }
60
+
61
+ Ensure the output is valid JSON:
62
+ - Use **double quotes** (") for all keys and string values.
63
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
64
+ maintain valid JSON formatting.
65
+ - Do not include any additional information in the output.
66
+ """
67
+
68
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
69
+ "accuracy": {"score", "explanation"},
70
+ "completeness": {"score", "explanation"},
71
+ "clarity": {"score", "explanation"},
72
+ }
73
+
74
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
75
+ "gpt": AnnotatorModelInfo(
76
+ model_name="openai/gpt-4o-2024-05-13",
77
+ model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
78
+ ),
79
+ "llama": AnnotatorModelInfo(
80
+ model_name="meta/llama-3.3-70b-instruct",
81
+ model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
82
+ ),
83
+ "claude": AnnotatorModelInfo(
84
+ model_name="anthropic/claude-3-7-sonnet-20250219",
85
+ model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
86
+ ),
87
+ }
88
+
89
+
90
+ class MTSamplesReplicateAnnotator(LLMAsJuryAnnotator):
91
+ """The MTSamplesReplicate autograder."""
92
+
93
+ name = "mtsamples_replicate"
94
+
95
+ def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
96
+ super().__init__(
97
+ auto_client=auto_client,
98
+ prompt_template=PROMPT_TEMPLATE,
99
+ annotation_criteria=ANNOTATION_CRITERIA,
100
+ annotator_models=ANNOTATOR_MODELS,
101
+ )
@@ -0,0 +1,152 @@
1
+ # CONTEXT #
2
+ I am a teacher, and I have some high-level math problems. I am tasked with evaluating the correctness of a student's answer.
3
+ Below, I am provided with a problem and a reference answer. Additionally, a student's answer is provided. My job is to assess whether the student's answer captures the same meaning as the reference answer, even when expressed with different wording or format.
4
+
5
+ # OBJECTIVE #
6
+ I need you to judge whether the student's answer is correct given the ground truth answer.
7
+
8
+ Your tasks include:
9
+ A. Identify Mathematical or Notational Equivalence: Pay special attention to any LaTeX expressions in both answers. Confirm that the mathematical relationships, variables, and operations conveyed are equivalent.
10
+ B. Provide a Justification: Conclude with a brief explanation as to why you believe the student's output is correct or incorrect, highlighting any key differences in meaning or content.
11
+
12
+ # STYLE #
13
+ Teaching report.
14
+
15
+ # TONE #
16
+ Professional, scientific.
17
+
18
+ # AUDIENCE #
19
+ Students. Enable them to better understand whether the answer they produce is correct.
20
+
21
+ # RESPONSE: MARKDOWN REPORT #
22
+ ## Student Final Answer
23
+ [Extract the student's final answer, which is enclosed in "\\boxed{}".]
24
+ ## Equivalence Judgement
25
+ [Whether the student's answer shares the same meaning as the reference answer. (TRUE or FALSE)]
26
+ ## Justification
27
+ [Conclude with a brief explanation as to why you believe the student's answer is correct or incorrect.]
28
+
29
+
30
+ # ATTENTION #
31
+ - The reference answer is ALWAYS correct. You should carefully judge whether the student gives the same answer as the reference answer.
32
+ - The Equivalence Judgement is only TRUE or FALSE. The answer is FALSE even if the student's final answer is almost correct but contains minor mistakes.
33
+ - The answer is contained within the "boxed" section, so you can focus solely on comparing the content in the student's answer box with the reference answer, without needing to consider the intermediate steps.
34
+ - Add "=== report over ===" at the end of the report.
35
+
36
+ <example math solution>
37
+ **Question**:
38
+ The numbers $1447$ , $1005$ and $1231$ have something in common: each is a $4$ -digit number beginning with $1$ that has exactly two identical digits. How many such numbers are there?
39
+
40
+ **Reference Answer**
41
+ 432
42
+
43
+ **Student Solution**:
44
+ To determine how many 4-digit numbers beginning with 1 have exactly two identical digits, we need to consider the structure of these numbers. A number of this form can be represented as \(1abc\), where \(a\), \(b\), and \(c\) are digits from 0 to 9, and exactly one of the digits \(a\), \(b\), or \(c\) is 1, while the other two are different from each other and different from 1.
45
+
46
+ We can break down the problem into the following steps:
47
+
48
+ 1. **Choose the digit that is 1**: There are 3 positions (a, b, c) where the digit 1 can be placed. So, there are 3 choices for the position of the digit 1.
49
+
50
+ 2. **Choose the other two digits**: The remaining two digits must be different from each other and different from 1. Therefore, we have 9 choices for the first of these two digits (0-9 except 1) and 8 choices for the second digit (0-9 except 1 and the first digit). This gives us \(9 \times 8 = 72\) choices for the other two digits.
51
+
52
+ Combining these choices, the total number of 4-digit numbers beginning with 1 that have exactly two identical digits is:
53
+ \[3 \times 72 = 216.\]
54
+
55
+ Thus, the number of such numbers is \(\boxed{216}\).
56
+
57
+ </example math solution>
58
+
59
+ ## Student Final Answer
60
+ 216
61
+
62
+ ## Equivalence Judgement
63
+ FALSE
64
+
65
+ ## Justification
66
+ The student's answer of 216 is incorrect in the context of the problem, which asks for the total count of 4-digit numbers beginning with 1 that have exactly two identical digits. The reference answer is 432.
67
+ In the student's solution, they consider only cases where the digit '1' is one of the identical digits. However, the problem also includes the scenario where the identical digits could be different from '1'. Thus, the student's calculation does not account for all valid configurations. The discrepancy in figures indicates that the student's answer does not share the same meaning as the reference answer.
68
+
69
+ === report over ===
70
+
71
+
72
+ <example math solution>
73
+ **Question**:
74
+ Susana and Brenda play a game writing polynomials on the board. Susana starts and they play taking turns.
75
+ 1) On the preparatory turn (turn 0), Susana choose a positive integer $n_0$ and writes the polynomial $P_0(x)=n_0$.
76
+ 2) On turn 1, Brenda choose a positive integer $n_1$, different from $n_0$, and either writes the polynomial
77
+
78
+ $$P_1(x)=n_1x+P_0(x) \textup{ or } P_1(x)=n_1x-P_0(x)$$
79
+
80
+ 3) In general, on turn $k$, the respective player chooses an integer $n_k$, different from $n_0, n_1, \ldots, n_{k-1}$, and either writes the polynomial
81
+ $$P_k(x)=n_kx^k+P_{k-1}(x) \textup{ or } P_k(x)=n_kx^k-P_{k-1}(x)$$
82
+ The first player to write a polynomial with at least one whole number root wins. Find and describe a winning strategy.
83
+
84
+ **Reference Answer**
85
+ \\text{No winning strategy; the game can last indefinitely.}
86
+
87
+ **Student Solution**:
88
+ To determine a winning strategy for Susana, we need to analyze the game and the conditions under which a player can win. The goal is to write a polynomial that has at least one integer root.
89
+
90
+ Let's start by examining the polynomials that are written on the board after each turn.
91
+
92
+ 1. On turn 0, Susana writes the polynomial \( P_0(x) = n_0 \), where \( n_0 \) is a positive integer. This polynomial is a constant polynomial with no integer roots unless \( n_0 = 0 \), which is not allowed since \( n_0 \) is a positive integer.
93
+
94
+ 2. On turn 1, Brenda chooses a positive integer \( n_1 \) different from \( n_0 \) and writes either \( P_1(x) = n_1 x + P_0(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - P_0(x) = n_1 x - n_0 \). In both cases, \( P_1(x) \) is a linear polynomial. A linear polynomial \( ax + b \) has an integer root if and only if \( b/a \) is an integer. Therefore, Brenda can choose \( n_1 \) and \( n_0 \) such that \( n_1 n_0 \) is divisible by \( n_1 \), which is always possible since \( n_1 \) and \( n_0 \) are different positive integers. For example, if \( n_1 = 2 \) and \( n_0 = 2 \), then \( P_1(x) = 2x - 2 \) has the integer root \( x = 1 \). However, since \( n_1 \) must be different from \( n_0 \), we need to consider the next turn.
95
+
96
+ 3. On turn 2, Susana chooses a positive integer \( n_2 \) different from \( n_0 \) and \( n_1 \) and writes either \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \). To ensure a win, Susana should choose \( n_2 \) such that the resulting polynomial has an integer root. Notice that if \( P_1(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - n_0 \), then \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \) will have an integer root if \( n_2 \) is chosen appropriately. For example, if \( P_1(x) = 2x - 2 \), then \( P_2(x) = n_2 x^2 + 2x - 2 \) or \( P_2(x) = n_2 x^2 - 2x + 2 \) will have an integer root if \( n_2 \) is chosen such that the polynomial can be factored to have an integer root.
97
+
98
+ By following this strategy, Susana can always choose \( n_2 \) such that the resulting polynomial has an integer root. Therefore, Susana has a winning strategy.
99
+
100
+ The winning strategy for Susana is to choose \( n_2 \) such that the resulting polynomial has an integer root. Specifically, if \( P_1(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - n_0 \), then Susana should choose \( n_2 \) such that \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \) has an integer root.
101
+
102
+ The answer is: \(\boxed{\text{Susana has a winning strategy.}}\)
103
+
104
+ </example math solution>
105
+
106
+
107
+ ## Student Final Answer
108
+ \text{Susana has a winning strategy.}
109
+
110
+ ## Equivalence Judgement
111
+ FALSE
112
+
113
+ ## Justification
114
+ The student's answer states that "Susana has a winning strategy," which is not in agreement with the reference answer indicating "No winning strategy; the game can last indefinitely." The essence of the problem is that neither player can guarantee a win because the game can go on indefinitely without either player being able to write a polynomial with an integer root. The student's conclusion contradicts this, suggesting that a specific winning strategy exists for Susana, which is fundamentally incorrect according to the problem's conditions. Thus, the student's answer does not capture the same meaning as the reference answer.
115
+
116
+
117
+ === report over ===
118
+
119
+ <example math solution>
120
+ **Question**:
121
+ Determine if there exists a (three-variable) polynomial $P(x,y,z)$ with integer coefficients satisfying the following property: a positive integer $n$ is [i]not[/i] a perfect square if and only if there is a triple $(x,y,z)$ of positive integers such that $P(x,y,z) = n$.
122
+
123
+ **Reference Answer**
124
+ P(x,y,z) = x^2 + y^2 + z^2 + 2xyz
125
+
126
+ **Student Solution**:
127
+ **Claim:** There exists a three-variable polynomial $P(x, y
128
+
129
+ </example math solution>
130
+
131
+ ## Student Final Answer
132
+ None
133
+
134
+ ## Equivalence Judgement
135
+ FALSE
136
+
137
+ ## Justification
138
+ The student's answer does not provide a final conclusion and the final answer is missing, which indicates a lack of conclusion.
139
+
140
+ === report over ===
141
+
142
+ <math solution>
143
+ **Question**:
144
+ {{Problem}}
145
+
146
+ **Reference Answer**
147
+ {{Reference Answer}}
148
+
149
+ **Student Solution**:
150
+ {{Solution}}
151
+
152
+ </math solution>