crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
@@ -12,7 +12,7 @@ from helm.benchmark.scenarios.scenario import (
12
12
  Output,
13
13
  )
14
14
  from helm.common.general import ensure_directory_exists
15
- from helm.common.hierarchical_logger import hlog
15
+ from helm.common.hierarchical_logger import hwarn
16
16
 
17
17
 
18
18
  class AutoBencherCapabilitiesScenario(Scenario):
@@ -61,7 +61,7 @@ class AutoBencherCapabilitiesScenario(Scenario):
61
61
  # References are category ID, followed by level 2, 3 and 4 category names.
62
62
  references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
63
63
  if row["gold_answer"] is None:
64
- hlog(f"WARNING: Row had no gold_answer: {row}")
64
+ hwarn(f"Row had no gold_answer: {row}")
65
65
  continue
66
66
  instance = Instance(input=input, references=references, split=TEST_SPLIT)
67
67
  instances.append(instance)
@@ -0,0 +1,66 @@
1
+ from typing import Any, List
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ TEST_SPLIT,
9
+ CORRECT_TAG,
10
+ Input,
11
+ Output,
12
+ )
13
+
14
+
15
class BLUEX_Scenario(Scenario):
    """
    The BLUEX dataset is a benchmark used for evaluating natural language processing models in Brazilian Portuguese.
    It consists of multiple-choice questions taken from official entrance exams of Unicamp (Convest) and USP (Fuvest),
    covering various high school subjects. The questions include both textual prompts and visual elements. This dataset
    was developed to assess the performance of models on tasks involving comprehension and reasoning, with a specific
    focus on texts and exams originally written in Portuguese.

    Questions that have associated images are skipped, as are questions whose
    answer key does not match any parsed alternative.
    """

    name = "bluex"
    description = "MQA benchmark with questions from Brazilian entrance exams"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    @staticmethod
    def _parse_alternatives(alternatives: List[str]) -> dict:
        """Map each alternative's upper-cased label to its text, e.g. "a) foo" -> {"A": "foo"}.

        Alternatives that contain no ")" separator are ignored.
        """
        parsed = {}
        for alternative in alternatives:
            label, separator, text = alternative.partition(")")
            if not separator:
                continue
            parsed[label.strip().upper()] = text.strip()
        return parsed

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load BLUEX from the Hugging Face Hub and build one test instance per text-only question.

        :param output_path: Scenario output directory; the dataset is cached under "<output_path>/data".
        :return: Instances whose references are the answer alternatives, with the correct one tagged.
        """
        cache_dir = str(Path(output_path) / "data")
        dataset: Any = load_dataset("portuguese-benchmark-datasets/BLUEX", cache_dir=cache_dir)

        instances: List[Instance] = []
        for example in dataset["questions"]:
            # This scenario disregards questions that rely on images.
            if example["has_associated_images"]:
                continue

            answers_by_label = self._parse_alternatives(example["alternatives"])

            # Labels are normalized to upper case when parsed, so normalize the
            # answer key the same way before looking it up.
            answer = example["answer"]
            correct_label = answer.strip().upper() if isinstance(answer, str) else answer
            if correct_label not in answers_by_label:
                continue

            # Tag by label rather than by text so that two alternatives with
            # identical text cannot both be marked correct.
            references = [
                Reference(Output(text=text), tags=[CORRECT_TAG] if label == correct_label else [])
                for label, text in answers_by_label.items()
            ]
            instances.append(
                Instance(
                    input=Input(text=example["question"]),
                    split=TEST_SPLIT,
                    references=references,
                )
            )
        return instances
@@ -1,8 +1,7 @@
1
- import os
2
1
  import pandas as pd
3
2
  from typing import List
4
3
 
5
- from helm.common.general import ensure_directory_exists
4
+ from helm.common.general import check_file_exists
6
5
  from helm.benchmark.scenarios.scenario import (
7
6
  Input,
8
7
  Scenario,
@@ -61,28 +60,30 @@ Clinical Note:
61
60
 
62
61
  class CHWCarePlanScenario(Scenario):
63
62
  """
64
- A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
63
+ A scenario for a dataset containing free form text of a clinical health worker care plan, with the
64
+ associated goal being to restructure that text into a given format.
65
65
 
66
66
  - Input: The clinical note (column "MO Note").
67
67
  - Output: The clinical note (column "MO Note"). We will use this note as the reference for entailment.
68
68
  """
69
69
 
70
70
  name = "chw_care_plan"
71
- description = "A dataset containing free form text of a clinical health worker care plan, with the \
72
- associated goal being to restructure that text into a given format."
71
+ description = (
72
+ "NoteExtract is a benchmark that focuses on the structured extraction of information"
73
+ "from free-form clinical text. It provides care plan notes authored by health workers"
74
+ "and evaluates a model's ability to convert them into a predefined structured format,"
75
+ "such as fields for Chief Complaint and History of Present Illness. The benchmark"
76
+ "emphasizes faithful extraction without hallucination or inference."
77
+ )
73
78
  tags = ["question_answering", "biomedical"]
74
79
 
75
- def __init__(self):
76
- """
77
- :param data_file: Path to the mimiciv_icd10.feather file.
78
- """
80
+ def __init__(self, data_path: str):
79
81
  super().__init__()
80
- self.data_file = "/share/pi/nigam/datasets/CHW_Dataset.csv"
82
+ self.data_path = data_path
81
83
 
82
84
  def get_instances(self, output_path: str) -> List[Instance]:
83
- ensure_directory_exists(os.path.dirname(self.data_file))
84
-
85
- df = pd.read_csv(self.data_file) # columns: ["text", "target", ...]
85
+ check_file_exists(self.data_path, msg=f"[CHWCarePlanScenario] Required data file not found: '{self.data_path}'")
86
+ df = pd.read_csv(self.data_path) # columns: ["text", "target", ...]
86
87
 
87
88
  instances: List[Instance] = []
88
89
 
@@ -2,7 +2,7 @@ import os
2
2
  import pandas as pd
3
3
  from typing import List
4
4
 
5
- from helm.common.general import ensure_directory_exists
5
+ from helm.common.general import check_file_exists
6
6
  from helm.benchmark.scenarios.scenario import (
7
7
  Input,
8
8
  Scenario,
@@ -76,16 +76,21 @@ class CLEARScenario(Scenario):
76
76
  "unemployment": "unemployment",
77
77
  }
78
78
 
79
- def __init__(self, condition: str):
79
+ def __init__(self, condition: str, data_path: str):
80
80
  """Initialize the scenario with a specific medical condition"""
81
81
  super().__init__()
82
82
 
83
83
  if condition not in self.CONDITIONS:
84
84
  raise ValueError(f"Condition '{condition}' not supported. Available conditions: {self.CONDITIONS}")
85
-
85
+ self.data_path = data_path
86
86
  self.condition = condition
87
87
  self.name = f"clear_{condition}"
88
- self.description = f"A dataset for evaluating {self.CONDITION_PROMPTS[condition]} detection from patient notes with yes/no/maybe classifications." # noqa: E501
88
+ self.description = (
89
+ "CLEAR is a benchmark designed to evaluate models on their ability to detect medical"
90
+ "conditions from patient notes using categorical responses. Each instance consists of"
91
+ "a clinical note and a target condition, requiring the model to classify the patient's"
92
+ "history as either affirmative, negative, or uncertain."
93
+ ) # noqa: E501
89
94
  self.tags = ["classification", "biomedical", condition.replace("_", "-")]
90
95
 
91
96
  def get_answer_choices(self) -> List[str]:
@@ -95,9 +100,8 @@ class CLEARScenario(Scenario):
95
100
 
96
101
  def get_instances(self, output_path: str) -> List[Instance]:
97
102
  """Load and process the data for the specified conditon."""
98
- data_dir = "/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/"
99
- excel_path = os.path.join(data_dir, f"{self.condition}.xlsx")
100
- ensure_directory_exists(os.path.dirname(excel_path))
103
+ excel_path = os.path.join(self.data_path, f"{self.condition}.xlsx")
104
+ check_file_exists(excel_path, msg=f"[CLEARScenario] Required data file not found: '{excel_path}'")
101
105
 
102
106
  df = pd.read_excel(excel_path)
103
107
 
@@ -1549,7 +1549,7 @@ class CLEVALanguageModelingScenario(CLEVAScenario):
1549
1549
 
1550
1550
 
1551
1551
  class CLEVACodeSynthesisScenario(CLEVAScenario):
1552
- """
1552
+ r"""
1553
1553
  The code synthesis task of CLEVA benchmark.
1554
1554
 
1555
1555
  An example is:
@@ -0,0 +1,197 @@
1
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
2
+ import pandas as pd
3
+ import requests
4
+
5
+
6
class CodeInsightsCodeEfficiencyScenario(Scenario):
    """Scenario asking a model to imitate the runtime efficiency of a student's C++ code.

    For each student, the first three submissions (ordered by timestamp) are used
    as in-context examples and the fourth is the target question. The student's
    own response to the target question is the reference.
    """

    name = "codeinsights_code_efficiency"
    description = "Evaluate runtime efficiency alignment between LLM-generated code and student code"
    tags = ["codeinsights", "c++", "code_efficiency"]

    def __init__(self, num_testcases: int = 1):
        """
        :param num_testcases: Number of unit tests to keep per question. Questions with
            fewer parsed unit tests are skipped. A negative value keeps all of them.
        """
        super().__init__()
        self.num_testcases = num_testcases

    @staticmethod
    def _parse_unittests(raw_unittests):
        """Parse the "question_unittests" text into a list of test-case dicts.

        Each test case is the text following an "Unittest" marker and must contain
        "Input:", "STD input:" and "Output:" markers.

        :return: A list of {"input", "std_in", "output"} dicts, or None if any test
            case is missing one of the markers.
        """
        parsed = []
        for testcase_str in raw_unittests.split("Unittest")[1:]:
            testcase_str = testcase_str[testcase_str.find(":") + 1 :]
            input_idx = testcase_str.find("Input:")
            std_in_idx = testcase_str.find("STD input:")
            output_idx = testcase_str.find("Output:")
            if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
                return None
            parsed.append(
                {
                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
                    "output": testcase_str[output_idx + 7 :].strip(),
                }
            )
        return parsed

    def get_instances(self, output_path: str):
        """Download the dataset and build one validation instance per eligible student.

        :param output_path: Scenario output directory (not used by this scenario).
        :return: A list of instances; empty if the test-case index could not be loaded.
        """
        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario4_data.csv")

        # Load test cases (unit tests); only the question IDs (keys) are used below.
        test_cases = self._load_test_cases()
        if not test_cases:
            print("WARNING: No test cases loaded!")
            return []
        available_question_ids = set(test_cases.keys())
        print(f"Loaded test cases for {len(available_question_ids)} questions")

        instances = []
        skipped_no_tests = 0
        skipped_insufficient_data = 0

        for student_id, student_df in df.groupby("student_id"):
            student_df = student_df.sort_values("timestamp")
            # Three in-context examples plus one target question are required.
            if len(student_df) < 4:
                skipped_insufficient_data += 1
                continue

            first = student_df.iloc[0]
            second = student_df.iloc[1]
            third = student_df.iloc[2]
            target = student_df.iloc[3]

            # Check if target question has test cases BEFORE processing
            target_question_id = target.get("question_unittest_id", None)
            if not target_question_id or str(target_question_id) not in available_question_ids:
                skipped_no_tests += 1
                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
                continue

            question_test_cases = self._parse_unittests(target["question_unittests"])
            if question_test_cases is None:
                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
                continue

            if len(question_test_cases) < self.num_testcases:
                # If not enough test cases, skip this question
                continue
            if self.num_testcases >= 0:
                # A non-negative num_testcases caps the number of test cases;
                # a negative value keeps every parsed test case.
                question_test_cases = question_test_cases[: self.num_testcases]

            # Get student pass pattern for the target question. A missing value
            # (None or NaN from the CSV) yields an empty pattern instead of
            # raising in int().
            student_correctness_pattern = target.get("pass", None)
            if student_correctness_pattern is None or pd.isna(student_correctness_pattern):
                student_correctness_list = []
            else:
                # Convert each digit of the integer pattern to an int
                student_correctness_list = [int(ch) for ch in str(int(student_correctness_pattern))]

            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
            print(f"Test cases loaded: {len(question_test_cases)}")
            print(f"Student correctness pattern: {student_correctness_list}")
            print(f"Question name: {target.get('question_name', 'MISSING')}")

            examples_section = "".join(
                f"Example {index}:\n"
                f"Question: {row['question_name']} — {row['question_text']}\n"
                "Template:\n"
                f"{row['question_template']}\n"
                "Your Code:\n"
                f"{row['response']}\n\n"
                for index, row in enumerate((first, second, third), start=1)
            )

            # The conditional must be evaluated on its own. Written inline between
            # adjacent string literals, "X if cond else Y" swallows the entire
            # concatenated prefix as X and the entire suffix as Y, which dropped the
            # template and the output instructions whenever test cases were present.
            unit_test_section = f"Unit Test Input: {question_test_cases}\n\n" if question_test_cases else ""

            prompt = (
                f"Week: {target['week']}\n"
                f"Topic: {target['topic']}\n\n"
                f"{examples_section}"
                "Now, using that same student's coding style, attempt this:\n"
                "Ensure that the code works perfectly, but its efficiency should be based on students' past examples.\n"
                "If a student has a tendency to write correct but inefficient code, imitate the inefficiency "
                "but if they write efficiently, write efficiently too.\n"
                f"Question: {target['question_name']} — {target['question_text']}\n\n"
                f"{unit_test_section}"
                "Template:\n"
                f"{target['question_template']}\n\n"
                "Provide ONLY your C++ implementation following the given template, "
                "where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
                "and make sure the code is compatible with the Unit Test Input. "
                "int main() is always declared already so DO NOT produce that initialization on the code. "
                "Ensure your code is correct, includes any class definition when needed, "
                "and handles all edge cases properly. "
                "Return the code in C++ code block format, and nothing else, and produce only one set of code."
            )

            instances.append(
                Instance(
                    id=f"{student_id}_{target['question_unittest_id']}",
                    input=Input(text=prompt),
                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
                    extra_data={
                        "question_template": target["question_template"],
                        "test_cases": question_test_cases,
                        "question_id": str(target_question_id),
                        "question_name": target.get("question_name", ""),
                        "student_id": str(student_id),
                        "student_correctness_pattern": student_correctness_list,
                    },
                    split=VALID_SPLIT,
                )
            )

        # Print summary statistics
        print("\n=== INSTANCE CREATION SUMMARY ===")
        print(f"Total instances created: {len(instances)}")
        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
        print(f"Skipped (no test cases): {skipped_no_tests}")
        print(f"Available test case question IDs: {len(available_question_ids)}")

        if instances:
            print("Sample created instances:")
            for inst in instances[:5]:
                if inst.extra_data is None:
                    test_count = 0
                else:
                    test_count = len(inst.extra_data.get("test_cases", []))
                print(f"  {inst.id}: {test_count} test cases")

        return instances

    def _load_test_cases(self):
        """
        Download the per-question test-case index, or return an empty dict if unavailable.

        Expected format of the downloaded JSON:
        {
            "question_id": [
                {
                    "unittest": "test_id",
                    "input": "test input code",
                    "output": "expected output"
                },
                ...
            ],
            ...
        }

        :return: The parsed JSON on HTTP 200; {} on a network error, invalid JSON
            or any other status code.
        """
        try:
            # Bound the request so an unresponsive server cannot hang the run.
            response = requests.get(
                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json",
                timeout=60,
            )
            if response.status_code == 200:
                return response.json()
            print(f"Failed to load test cases from URL: HTTP status {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            print(f"Failed to load test cases from URL: {e}")
        return {}
@@ -0,0 +1,78 @@
1
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT
2
+ import pandas as pd
3
+
4
+
5
class CodeInsightsCorrectCodeScenario(Scenario):
    """
    Scenario asking a model to write correct C++ code for programming questions.

    One instance is created per ``question_unittest_id`` found in the
    CodeInsights CSV. The unit tests embedded in the ``question_unittests``
    column are parsed and attached to each instance through ``extra_data``.
    """

    name = "codeinsights_correct_code"
    description = "Generate correct response code for C++ programming questions"
    tags = ["codeinsights", "c++", "correct_code"]

    def __init__(self, num_testcases: int = 1):
        """
        Args:
            num_testcases: Minimum number of parsed unit tests a question must
                have to be kept; when non-negative, also the number of leading
                unit tests retained per question. A negative value keeps every
                parsed unit test.
        """
        super().__init__()
        self.num_testcases = num_testcases

    @staticmethod
    def _parse_test_cases(raw_unittests):
        """
        Split a ``question_unittests`` cell into a list of test-case dicts.

        Each section following a "Unittest" marker must contain the labels
        "Input:", "STD input:" and "Output:"; the text between them becomes the
        ``input``, ``std_in`` and ``output`` entries.

        Returns:
            A list of dicts (possibly empty), or None when the cell is not a
            string or any section lacks one of the three labels.
        """
        if not isinstance(raw_unittests, str):
            # A missing cell (e.g. NaN from an empty CSV field) has no
            # ``split``; treat it like a malformed entry instead of crashing.
            return None

        parsed = []
        for section in raw_unittests.split("Unittest")[1:]:
            # Drop the section label: everything up to and including the first colon.
            section = section[section.find(":") + 1 :]
            input_idx = section.find("Input:")
            std_in_idx = section.find("STD input:")
            output_idx = section.find("Output:")
            if -1 in (input_idx, std_in_idx, output_idx):
                return None
            parsed.append(
                {
                    # Offsets skip the label text itself ("Input:", "STD input:", "Output:").
                    "input": section[input_idx + 6 : std_in_idx].strip(),
                    "std_in": section[std_in_idx + 10 : output_idx].strip(),
                    "output": section[output_idx + 7 :].strip(),
                }
            )
        return parsed

    @staticmethod
    def _build_prompt(target, question_test_cases):
        """
        Assemble the model prompt for one question.

        The question and the template/instruction sections are always present;
        the "Unit Test Input" section is included only when test cases exist.
        """
        # Build from explicit pieces: writing this as one parenthesised
        # `... if cond else "" ...` expression lets implicit string
        # concatenation bind tighter than the conditional, which drops either
        # the instructions or the question from the prompt.
        pieces = [f"Question: {target['question_name']} — {target['question_text']}\n\n"]
        if question_test_cases:
            pieces.append(f"Unit Test Input: {question_test_cases}\n\n")
        pieces.append(
            "Template:\n"
            f"{target['question_template']}\n\n"
            "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "  # noqa: E501
            "DO NOT reproduce the template part as the generated code would be inserted to the template, "
            "and make sure the code is compatible with the Unit Test Input. "
            "int main() is always declared already so DO NOT produce that initialization on the code. "
            "Ensure your code is correct, efficient, includes any class definition when needed, and handles all edge cases properly. "  # noqa: E501
            "Return the code in C++ code block format, and nothing else."
        )
        return "".join(pieces)

    def get_instances(self, output_path: str):
        """
        Download the dataset and build one validation instance per question.

        Questions whose unit tests cannot be parsed, or that have fewer than
        ``self.num_testcases`` parsed unit tests, are skipped.

        Args:
            output_path: Unused by this scenario; the data is read from a URL.

        Returns:
            A list of ``Instance`` objects in the validation split.
        """
        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")

        instances = []
        for question_id, question_df in df.groupby("question_unittest_id"):
            # The first row of each group supplies the question metadata.
            target = question_df.iloc[0]

            question_test_cases = self._parse_test_cases(target["question_unittests"])
            if question_test_cases is None:
                continue

            if len(question_test_cases) < self.num_testcases:
                # Not enough test cases for this question.
                continue
            if self.num_testcases >= 0:
                # Keep only the first `num_testcases` tests; a negative value keeps all.
                question_test_cases = question_test_cases[: self.num_testcases]

            instances.append(
                Instance(
                    id=f"{question_id}",
                    input=Input(text=self._build_prompt(target, question_test_cases)),
                    references=[],
                    extra_data={
                        "question_template": target["question_template"],
                        "test_cases": question_test_cases,
                        "question_id": str(question_id) if question_id else None,
                        "question_name": target.get("question_name", ""),
                    },
                    split=VALID_SPLIT,
                )
            )
        return instances
@@ -0,0 +1,192 @@
1
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
2
+ import pandas as pd
3
+ import requests
4
+
5
+
6
class CodeInsightsEdgeCaseScenario(Scenario):
    """
    Scenario asking a model to predict which unit test a student will fail.

    One instance is created per student, based on the student's earliest row
    (by ``timestamp``) in the CodeInsights CSV. The prompt combines the
    student's per-topic performance with the question and its unit tests.
    """

    name = "codeinsights_edge_case"
    description = "Evaluate alignment in edge case failure between LLM-generated code and student code"
    tags = ["codeinsights", "c++", "edge_case"]

    def __init__(self, num_testcases: int = 1):
        """
        Args:
            num_testcases: Minimum number of parsed unit tests a question must
                have to be kept; when non-negative, also the number of leading
                unit tests retained. A negative value keeps every parsed test.
        """
        super().__init__()
        self.num_testcases = num_testcases

    @staticmethod
    def _parse_test_cases(raw_unittests):
        """
        Split a ``question_unittests`` cell into a list of test-case dicts.

        Each section following a "Unittest" marker must contain the labels
        "Input:", "STD input:" and "Output:".

        Returns:
            A list of dicts (possibly empty), or None when the cell is not a
            string or any section lacks one of the three labels.
        """
        if not isinstance(raw_unittests, str):
            # A missing cell (e.g. NaN from an empty CSV field) has no
            # ``split``; treat it like a malformed entry instead of crashing.
            return None

        parsed = []
        for section in raw_unittests.split("Unittest")[1:]:
            # Drop the section label: everything up to and including the first colon.
            section = section[section.find(":") + 1 :]
            input_idx = section.find("Input:")
            std_in_idx = section.find("STD input:")
            output_idx = section.find("Output:")
            if -1 in (input_idx, std_in_idx, output_idx):
                return None
            parsed.append(
                {
                    # Offsets skip the label text itself ("Input:", "STD input:", "Output:").
                    "input": section[input_idx + 6 : std_in_idx].strip(),
                    "std_in": section[std_in_idx + 10 : output_idx].strip(),
                    "output": section[output_idx + 7 :].strip(),
                }
            )
        return parsed

    @staticmethod
    def _correctness_list(pass_value):
        """
        Convert the ``pass`` column value into a list of per-digit ints.

        Returns an empty list when the value is missing (None or NaN).
        """
        if pass_value is None or pd.isna(pass_value):
            # int(NaN) raises ValueError; a missing pattern means "unknown".
            return []
        # NOTE: going through int() discards any leading zeros of the pattern.
        return [int(ch) for ch in str(int(pass_value))]

    @staticmethod
    def _student_profile(student_id, student_topic):
        """Describe the student's per-topic pass rates as prompt text."""
        lines = [f"Student {student_id} has the following performance across topics:\n"]
        topic_performance = student_topic[student_topic["student_id"] == student_id]
        for _, row in topic_performance.iterrows():
            topic = row["topic"]
            pass_rate = round(row["pass_rate"], 2)
            perfect = round(row["perfect"], 2)
            lines.append(
                f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
                f"and the rate of passing all unit tests is {perfect}.\n"
            )
        return "".join(lines)

    @staticmethod
    def _build_prompt(target, student_level_prompt, target_test_cases):
        """
        Assemble the model prompt for one student/question pair.

        The task description, student profile, question and answer-format
        sections are always present; the "Unit Tests" line is included only
        when test cases exist.
        """
        # Build from explicit pieces: writing this as one parenthesised
        # `... if cond else "" ...` expression lets implicit string
        # concatenation bind tighter than the conditional, which drops either
        # the closing instructions or the whole task description.
        pieces = [
            "You are analyzing a student’s likely mistakes on an upcoming programming problem.\n"
            "Your task: **predict exactly ONE unit-test index (0-based) that the student is most likely to fail.**\n"  # noqa: E501
            "Return *only* that integer. No explanation.\n\n"
            "=== Student Profile ===\n"
            f"{student_level_prompt}\n"
            "For the given programming question, identify which unit test the student would fail considering "
            "their past performance, as well as consideration of unit test difficulty.\n"
            f"Week: {target['week']}\n"
            f"Topic: {target['topic']}\n\n"
            f"Question: {target['question_name']} — {target['question_text']}\n"
        ]
        if target_test_cases:
            pieces.append(f"Unit Tests: {target_test_cases}\n")
        pieces.append(
            "Think silently about:\n"
            "• Which test seems hardest for the given topic?\n"
            "• Where has the student historically struggled?\n"
            "• Any edge-cases in the tests’ inputs/outputs?\n\n"
            "******** ANSWER FORMAT (STRICT) ********\n"
            "<integer>\n"
            "****************************************"
        )
        return "".join(pieces)

    def get_instances(self, output_path: str):
        """
        Download the dataset and build one validation instance per student.

        Students are skipped when their target question has no entry in the
        downloaded test-case index, when its unit tests cannot be parsed, or
        when fewer than ``self.num_testcases`` unit tests were parsed.

        Args:
            output_path: Unused by this scenario; the data is read from URLs.

        Returns:
            A list of ``Instance`` objects in the validation split; empty when
            no test cases could be loaded.
        """
        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario5_data.csv")

        student_topic = pd.read_csv(
            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
        )

        # The downloaded index is only used to decide which questions have tests.
        test_cases = self._load_test_cases()
        if not test_cases:
            print("WARNING: No test cases loaded!")
            return []
        available_question_ids = set(test_cases.keys())
        print(f"Loaded test cases for {len(available_question_ids)} questions")

        instances = []
        skipped_no_tests = 0
        skipped_insufficient_data = 0

        for student_id, student_df in df.groupby("student_id"):
            # The student's earliest attempt is the prediction target.
            target = student_df.sort_values("timestamp").iloc[0]

            # Check that the target question has test cases BEFORE processing.
            target_question_id = target.get("question_unittest_id", None)
            if not target_question_id or str(target_question_id) not in available_question_ids:
                skipped_no_tests += 1
                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
                continue

            target_test_cases = self._parse_test_cases(target["question_unittests"])
            if target_test_cases is None:
                skipped_no_tests += 1
                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
                continue

            if len(target_test_cases) < self.num_testcases:
                # Not enough test cases: count the skip so the summary is accurate.
                skipped_insufficient_data += 1
                continue
            if self.num_testcases >= 0:
                # Keep only the first `num_testcases` tests; a negative value keeps all.
                target_test_cases = target_test_cases[: self.num_testcases]

            student_correctness_list = self._correctness_list(target.get("pass", None))
            student_level_prompt = self._student_profile(student_id, student_topic)

            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
            print(f"Test cases loaded: {len(target_test_cases)}")
            print(f"Student correctness pattern: {student_correctness_list}")
            print(f"Question name: {target.get('question_name', 'MISSING')}")

            prompt = self._build_prompt(target, student_level_prompt, target_test_cases)

            instances.append(
                Instance(
                    id=f"{student_id}_{target['question_unittest_id']}",
                    input=Input(text=prompt),
                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
                    extra_data={
                        "question_template": target["question_template"],
                        "test_cases": target_test_cases,
                        "question_id": str(target_question_id),
                        "question_name": target.get("question_name", ""),
                        "student_id": str(student_id),
                        "student_correctness_pattern": student_correctness_list,
                    },
                    split=VALID_SPLIT,
                )
            )

        # Print summary statistics
        print("\n=== INSTANCE CREATION SUMMARY ===")
        print(f"Total instances created: {len(instances)}")
        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
        print(f"Skipped (no test cases): {skipped_no_tests}")
        print(f"Available test case question IDs: {len(available_question_ids)}")

        if instances:
            # Show up to five samples, even when fewer than five instances exist.
            print("Sample created instances:")
            for inst in instances[:5]:
                if inst.extra_data is None:
                    test_count = 0
                else:
                    test_count = len(inst.extra_data.get("test_cases", []))
                print(f"  {inst.id}: {test_count} test cases")

        return instances

    def _load_test_cases(self):
        """
        Download the per-question unit tests from the CodeInsights dataset.

        Expected format of the downloaded JSON:
        {
            "question_id": [
                {
                    "unittest": "test_id",
                    "input": "test input code",
                    "output": "expected output"
                },
                ...
            ],
            ...
        }

        Returns:
            The decoded JSON payload when the request succeeds with HTTP 200.
            An empty dict (never None) when the request fails, the server
            answers with another status code, or the body cannot be decoded.
        """
        url = "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
        try:
            # requests applies no timeout by default; without one a stalled
            # connection would block scenario loading forever.
            response = requests.get(url, timeout=60)
            if response.status_code == 200:
                return response.json()
            # Report the failure instead of silently falling through to {}.
            print(f"Failed to load test cases from URL: HTTP {response.status_code}")
        except Exception as e:
            # Deliberately broad: loading is best-effort and the caller treats
            # an empty result as "no test cases available".
            print(f"Failed to load test cases from URL: {e}")
        return {}