crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/code_scenario.py
@@ -55,6 +55,7 @@ import os
  import sys
  from typing import List, Dict, Iterable, Optional, cast
 
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog
  from helm.benchmark.scenarios.code_scenario_helper import run as run_reindent
@@ -69,6 +70,7 @@ from helm.benchmark.scenarios.scenario import (
      CORRECT_TAG,
      Input,
      Output,
+     ScenarioMetadata,
  )
 
 
@@ -331,3 +333,29 @@ class CodeScenario(Scenario):
              raise ValueError(f"Unknown dataset: {self.dataset}")
 
          return cast(List[Instance], instances)
+
+     def get_metadata(self) -> ScenarioMetadata:
+         if self.dataset == "humaneval":
+             return ScenarioMetadata(
+                 name="code_humaneval",
+                 display_name="HumanEval (Code)",
+                 description="The HumanEval benchmark for measuring functional correctness for synthesizing "
+                 "programs from docstrings [(Chen et al., "
+                 "2021)](https://arxiv.org/pdf/2107.03374.pdf).",
+                 taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                 main_metric="pass",
+                 main_split="test",
+             )
+         elif self.dataset == "apps":
+             return ScenarioMetadata(
+                 name="code_apps",
+                 display_name="APPS (Code)",
+                 description="The APPS benchmark for measuring competence on code challenges [(Hendrycks et "
+                 "al., "
+                 "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/c24cd76e1ce41366a4bbe8a49b02a028-Abstract-round2.html).",
+                 taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                 main_metric="test_avg",
+                 main_split="test",
+             )
+         else:
+             raise Exception(f"Unknown dataset {self.dataset}")
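For orientation, a minimal sketch of how the ScenarioMetadata hook added above can be exercised. The field names and values mirror the hunk; constructing ScenarioMetadata outside a Scenario subclass and reading its attributes directly are assumptions for illustration only, not part of the release.

    from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
    from helm.benchmark.scenarios.scenario import ScenarioMetadata

    # Field names and values taken from CodeScenario.get_metadata above;
    # the dataclass-style constructor and attribute access are assumed.
    metadata = ScenarioMetadata(
        name="code_humaneval",
        display_name="HumanEval (Code)",
        description="HumanEval functional-correctness benchmark (abbreviated).",
        taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
        main_metric="pass",
        main_split="test",
    )
    print(metadata.name, metadata.main_metric)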
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py
@@ -0,0 +1,197 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsCodeEfficiencyScenario(Scenario):
+     name = "codeinsights_code_efficiency"
+     description = "Evaluate runtime efficiency alignment between LLM-generated code and student code"
+     tags = ["codeinsights", "c++", "code_efficiency"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario4_data.csv")
+
+         # Load test cases (unit tests)
+         test_cases = self._load_test_cases()
+
+         # Get available question IDs with test cases
+         available_question_ids = set()
+         if test_cases:
+             available_question_ids = set(test_cases.keys())
+             print(f"Loaded test cases for {len(available_question_ids)} questions")
+         else:
+             print("WARNING: No test cases loaded!")
+             return []
+
+         instances = []
+         skipped_no_tests = 0
+         skipped_insufficient_data = 0
+
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values("timestamp")
+             if len(student_df) < 4:
+                 skipped_insufficient_data += 1
+                 continue
+
+             first = student_df.iloc[0]
+             second = student_df.iloc[1]
+             third = student_df.iloc[2]
+             target = student_df.iloc[3]
+
+             # Check if target question has test cases BEFORE processing
+             target_question_id = target.get("question_unittest_id", None)
+             if not target_question_id or str(target_question_id) not in available_question_ids:
+                 skipped_no_tests += 1
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                 continue
+
+             # Get test cases for this question (we know they exist now)
+             question_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 question_test_cases.append(testcase)
+
+             if not tc_parsing_success:
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                 continue
+
+             if len(question_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 question_test_cases = question_test_cases[: self.num_testcases]
+
+             # Get student pass pattern for the target question
+             student_correctness_pattern = target.get("pass", None)
+             if student_correctness_pattern is not None:
+                 main_part = int(student_correctness_pattern)
+                 # Convert each character to an int
+                 student_correctness_list = [int(ch) for ch in str(main_part)]
+             else:
+                 student_correctness_list = []
+
+             print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+             print(f"Test cases loaded: {len(question_test_cases)}")
+             print(f"Student correctness pattern: {student_correctness_list}")
+             print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+             prompt = (
+                 f"Week: {target['week']}\n"
+                 f"Topic: {target['topic']}\n\n"
+                 "Example 1:\n"
+                 f"Question: {first['question_name']} — {first['question_text']}\n"
+                 "Template:\n"
+                 f"{first['question_template']}\n"
+                 "Your Code:\n"
+                 f"{first['response']}\n\n"
+                 "Example 2:\n"
+                 f"Question: {second['question_name']} — {second['question_text']}\n"
+                 "Template:\n"
+                 f"{second['question_template']}\n"
+                 "Your Code:\n"
+                 f"{second['response']}\n\n"
+                 "Example 3:\n"
+                 f"Question: {third['question_name']} — {third['question_text']}\n"
+                 "Template:\n"
+                 f"{third['question_template']}\n"
+                 "Your Code:\n"
+                 f"{third['response']}\n\n"
+                 "Now, using that same student's coding style, attempt this:\n"
+                 "Ensure that the code works perfectly, but its efficiency should be based on students' past examples.\n"
+                 "If a student has a tendency to write correct but inefficient code, imitate the inefficiency "
+                 "but if they write efficiently, write efficiently too.\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                 f"Unit Test Input: {question_test_cases}\n\n"
+                 if question_test_cases
+                 else ""
+                 "Template:\n"
+                 f"{target['question_template']}\n\n"
+                 "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                 "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                 "and make sure the code is compatible with the Unit Test Input. "
+                 "int main() is always declared already so DO NOT produce that initialization on the code. "
+                 "Ensure your code is correct, includes any class definition when needed, and handles all edge cases properly. "
+                 "Return the code in C++ code block format, and nothing else, and produce only one set of code."
+             )
+
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": question_test_cases,
+                         "question_id": str(target_question_id),
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+
+         # Print summary statistics
+         print("\n=== INSTANCE CREATION SUMMARY ===")
+         print(f"Total instances created: {len(instances)}")
+         print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+         print(f"Skipped (no test cases): {skipped_no_tests}")
+         print(f"Available test case question IDs: {len(available_question_ids)}")
+
+         if instances:
+             print("Sample created instances:")
+             for i, inst in enumerate(instances[:5]):
+                 if inst.extra_data is None:
+                     test_count = 0
+                 else:
+                     test_count = len(inst.extra_data.get("test_cases", []))
+                 print(f" {inst.id}: {test_count} test cases")
+
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}
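All four new CodeInsights scenarios parse the question_unittests CSV column with the same find/slice loop seen above. A standalone sketch of that logic on an invented sample string (the real column text lives in the Kazchoko CSVs and is not shown in the diff, so the sample below is illustrative only):

    # Invented sample; only the "Unittest"/"Input:"/"STD input:"/"Output:" markers come from the parser above.
    raw = "Unittest 0:Input: add(1, 2)STD input: Output: 3"

    cases = []
    for chunk in raw.split("Unittest")[1:]:
        # Drop everything up to and including the first colon ("Unittest 0:").
        chunk = chunk[chunk.find(":") + 1 :]
        cases.append(
            {
                "input": chunk[chunk.find("Input:") + 6 : chunk.find("STD input:")].strip(),
                "std_in": chunk[chunk.find("STD input:") + 10 : chunk.find("Output:")].strip(),
                "output": chunk[chunk.find("Output:") + 7 :].strip(),
            }
        )
    print(cases)  # [{'input': 'add(1, 2)', 'std_in': '', 'output': '3'}]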
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py
@@ -0,0 +1,78 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT
+ import pandas as pd
+
+
+ class CodeInsightsCorrectCodeScenario(Scenario):
+     name = "codeinsights_correct_code"
+     description = "Generate correct response code for C++ programming questions"
+     tags = ["codeinsights", "c++", "correct_code"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+
+         # Load test cases (unit tests)
+         instances = []
+         for question_id, question_df in df.groupby("question_unittest_id"):
+             target = question_df.iloc[0]
+             question_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 question_test_cases.append(testcase)
+
+             if not tc_parsing_success:
+                 continue
+
+             if len(question_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 question_test_cases = question_test_cases[: self.num_testcases]
+
+             prompt = (
+                 f"Question: {target['question_name']} — {target['question_text']}\n\n"
+                 f"Unit Test Input: {question_test_cases}\n\n"
+                 if question_test_cases
+                 else ""
+                 "Template:\n"
+                 f"{target['question_template']}\n\n"
+                 "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                 "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                 "and make sure the code is compatible with the Unit Test Input. "
+                 "int main() is always declared already so DO NOT produce that initialization on the code. "
+                 "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly. "
+                 "Return the code in C++ code block format, and nothing else."
+             )
+             instances.append(
+                 Instance(
+                     id=f"{question_id}",
+                     input=Input(text=prompt),
+                     references=[],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": question_test_cases,
+                         "question_id": str(question_id) if question_id else None,
+                         "question_name": target.get("question_name", ""),
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+         return instances
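A hedged usage sketch, not part of the release: driving the scenario above directly. The import path, class name, constructor argument, and get_instances signature come from the new file; the output path value is arbitrary, and the call downloads the Scenario1_2_data.csv referenced above, so it needs network access.

    from helm.benchmark.scenarios.codeinsights_correct_code_scenario import (
        CodeInsightsCorrectCodeScenario,
    )

    # output_path is required by the Scenario interface but unused by this implementation.
    scenario = CodeInsightsCorrectCodeScenario(num_testcases=1)
    instances = scenario.get_instances(output_path="/tmp/codeinsights")
    print(f"Built {len(instances)} instances")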
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py
@@ -0,0 +1,192 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsEdgeCaseScenario(Scenario):
+     name = "codeinsights_edge_case"
+     description = "Evaluate alignment in edge case failure between LLM-generated code and student code"
+     tags = ["codeinsights", "c++", "edge_case"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario5_data.csv")
+
+         student_topic = pd.read_csv(
+             "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+         )
+
+         # Load test cases (unit tests)
+         test_cases = self._load_test_cases()
+
+         # Get available question IDs with test cases
+         available_question_ids = set()
+         if test_cases:
+             available_question_ids = set(test_cases.keys())
+             print(f"Loaded test cases for {len(available_question_ids)} questions")
+         else:
+             print("WARNING: No test cases loaded!")
+             return []
+
+         instances = []
+         skipped_no_tests = 0
+         skipped_insufficient_data = 0
+
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values("timestamp")
+             target = student_df.iloc[0]
+
+             # Check if target question has test cases BEFORE processing
+             target_question_id = target.get("question_unittest_id", None)
+             if not target_question_id or str(target_question_id) not in available_question_ids:
+                 skipped_no_tests += 1
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                 continue
+
+             # Get test cases for this question (we know they exist now)
+             target_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 target_test_cases.append(testcase)
+
+             # Verify test cases are not empty
+             if not tc_parsing_success:
+                 skipped_no_tests += 1
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                 continue
+
+             if len(target_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 target_test_cases = target_test_cases[: self.num_testcases]
+
+             # Get student pass pattern for the target question
+             student_correctness_pattern = target.get("pass", None)
+             if student_correctness_pattern is not None:
+                 main_part = int(student_correctness_pattern)
+                 # Convert each character to an int
+                 student_correctness_list = [int(ch) for ch in str(main_part)]
+             else:
+                 student_correctness_list = []
+
+             # Student specific topic performance in previous attempts
+             student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+             topic_performance = student_topic[student_topic["student_id"] == student_id]
+             for _, row in topic_performance.iterrows():
+                 topic = row["topic"]
+                 pass_rate = round(row["pass_rate"], 2)
+                 perfect = round(row["perfect"], 2)
+
+                 student_level_prompt += (
+                     f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                     f"and the rate of passing all unit tests is {perfect}.\n"
+                 )
+
+             print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+             print(f"Test cases loaded: {len(target_test_cases)}")
+             print(f"Student correctness pattern: {student_correctness_list}")
+             print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+             prompt = (
+                 "You are analyzing a student’s likely mistakes on an upcoming programming problem.\n"
+                 "Your task: **predict exactly ONE unit-test index (0-based) that the student is most likely to fail.**\n"  # noqa: E501
+                 "Return *only* that integer. No explanation.\n\n"
+                 "=== Student Profile ===\n"
+                 f"{student_level_prompt}\n"
+                 "For the given programming question, identify which unit test the student would fail considering "
+                 "their past performance, as well as consideration of unit test difficulty.\n"
+                 f"Week: {target['week']}\n"
+                 f"Topic: {target['topic']}\n\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n"
+                 f"Unit Tests: {target_test_cases}\n"
+                 if target_test_cases
+                 else ""
+                 "Think silently about:\n"
+                 "• Which test seems hardest for the given topic?\n"
+                 "• Where has the student historically struggled?\n"
+                 "• Any edge-cases in the tests’ inputs/outputs?\n\n"
+                 "******** ANSWER FORMAT (STRICT) ********\n"
+                 "<integer>\n"
+                 "****************************************"
+             )
+
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": target_test_cases,
+                         "question_id": str(target_question_id),
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+
+         # Print summary statistics
+         print("\n=== INSTANCE CREATION SUMMARY ===")
+         print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+         print(f"Skipped (no test cases): {skipped_no_tests}")
+         print(f"Available test case question IDs: {len(available_question_ids)}")
+
+         if len(instances) >= 5:
+             print("Sample created instances:")
+             for i, inst in enumerate(instances[:5]):
+                 if inst.extra_data is None:
+                     test_count = 0
+                 else:
+                     test_count = len(inst.extra_data.get("test_cases", []))
+                 print(f" {inst.id}: {test_count} test cases")
+
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py
@@ -0,0 +1,162 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsStudentCodingScenario(Scenario):
+     name = "codeinsights_student_coding"
+     description = "Mimic student C++ style on foundational questions"
+     tags = ["codeinsights", "c++", "student_coding"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+         student_topic = pd.read_csv(
+             "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+         )
+
+         instances = []
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values("timestamp")
+             if len(student_df) < 4:
+                 continue
+             first = student_df.iloc[0]
+             second = student_df.iloc[1]
+             third = student_df.iloc[2]
+             target = student_df.iloc[3]
+
+             # Get test cases for this question
+             question_id = target.get("question_unittest_id", None)
+             question_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 question_test_cases.append(testcase)
+
+             if not tc_parsing_success:
+                 continue
+
+             if len(question_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 question_test_cases = question_test_cases[: self.num_testcases]
+
+             # Get student pass (0 or 1) for the target question
+             student_correctness_pattern = target.get("pass", None)
+             main_part = int(student_correctness_pattern)  # "1111111111"
+             # Convert each character to an int
+             student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+             # Student specific topic performance in previous attempts
+             student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+             topic_performance = student_topic[student_topic["student_id"] == student_id]
+             for _, row in topic_performance.iterrows():
+                 topic = row["topic"]
+                 pass_rate = round(row["pass_rate"], 2)
+                 perfect = round(row["perfect"], 2)
+
+                 student_level_prompt += (
+                     f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                     f"and the rate of passing all unit tests is {perfect}.\n"
+                 )
+
+             prompt = (
+                 "=== Student Profile ===\n"
+                 f"{student_level_prompt}\n"
+                 f"Week: {target['week']}\n"
+                 f"Topic: {target['topic']}\n\n"
+                 "Example 1:\n"
+                 f"Question: {first['question_name']} — {first['question_text']}\n"
+                 "Template:\n"
+                 f"{first['question_template']}\n"
+                 "Your Code:\n"
+                 f"{first['response']}\n\n"
+                 "Example 2:\n"
+                 f"Question: {second['question_name']} — {second['question_text']}\n"
+                 "Template:\n"
+                 f"{second['question_template']}\n"
+                 "Your Code:\n"
+                 f"{second['response']}\n\n"
+                 "Example 3:\n"
+                 f"Question: {third['question_name']} — {third['question_text']}\n"
+                 "Template:\n"
+                 f"{third['question_template']}\n"
+                 "Your Code:\n"
+                 f"{third['response']}\n\n"
+                 "Now, using that same student style, attempt this:\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n"
+                 f"Unit Test Input: {question_test_cases}\n\n"
+                 if question_test_cases
+                 else ""
+                 "Template:\n"
+                 f"{target['question_template']}\n\n"
+                 "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                 "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                 "and make sure the code is compatible with the Unit Test Input. "
+                 "int main() is always declared already so DO NOT produce that initialization on the code. "
+                 "Ensure your code includes any class definition when needed. "
+                 "Return the code in C++ code block format, and nothing else."
+             )
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": question_test_cases,
+                         "question_id": str(question_id) if question_id else None,
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}