crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py (new file)
@@ -0,0 +1,192 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsEdgeCaseScenario(Scenario):
+     name = "codeinsights_edge_case"
+     description = "Evaluate alignment in edge case failure between LLM-generated code and student code"
+     tags = ["codeinsights", "c++", "edge_case"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario5_data.csv")
+
+         student_topic = pd.read_csv(
+             "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+         )
+
+         # Load test cases (unit tests)
+         test_cases = self._load_test_cases()
+
+         # Get available question IDs with test cases
+         available_question_ids = set()
+         if test_cases:
+             available_question_ids = set(test_cases.keys())
+             print(f"Loaded test cases for {len(available_question_ids)} questions")
+         else:
+             print("WARNING: No test cases loaded!")
+             return []
+
+         instances = []
+         skipped_no_tests = 0
+         skipped_insufficient_data = 0
+
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values("timestamp")
+             target = student_df.iloc[0]
+
+             # Check if target question has test cases BEFORE processing
+             target_question_id = target.get("question_unittest_id", None)
+             if not target_question_id or str(target_question_id) not in available_question_ids:
+                 skipped_no_tests += 1
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                 continue
+
+             # Get test cases for this question (we know they exist now)
+             target_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 target_test_cases.append(testcase)
+
+             # Verify test cases are not empty
+             if not tc_parsing_success:
+                 skipped_no_tests += 1
+                 print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                 continue
+
+             if len(target_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 target_test_cases = target_test_cases[: self.num_testcases]
+
+             # Get student pass pattern for the target question
+             student_correctness_pattern = target.get("pass", None)
+             if student_correctness_pattern is not None:
+                 main_part = int(student_correctness_pattern)
+                 # Convert each character to an int
+                 student_correctness_list = [int(ch) for ch in str(main_part)]
+             else:
+                 student_correctness_list = []
+
+             # Student specific topic performance in previous attempts
+             student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+             topic_performance = student_topic[student_topic["student_id"] == student_id]
+             for _, row in topic_performance.iterrows():
+                 topic = row["topic"]
+                 pass_rate = round(row["pass_rate"], 2)
+                 perfect = round(row["perfect"], 2)
+
+                 student_level_prompt += (
+                     f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                     f"and the rate of passing all unit tests is {perfect}.\n"
+                 )
+
+             print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+             print(f"Test cases loaded: {len(target_test_cases)}")
+             print(f"Student correctness pattern: {student_correctness_list}")
+             print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+             prompt = (
+                 "You are analyzing a student’s likely mistakes on an upcoming programming problem.\n"
+                 "Your task: **predict exactly ONE unit-test index (0-based) that the student is most likely to fail.**\n"  # noqa: E501
+                 "Return *only* that integer. No explanation.\n\n"
+                 "=== Student Profile ===\n"
+                 f"{student_level_prompt}\n"
+                 "For the given programming question, identify which unit test the student would fail considering "
+                 "their past performance, as well as consideration of unit test difficulty.\n"
+                 f"Week: {target['week']}\n"
+                 f"Topic: {target['topic']}\n\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n"
+                 f"Unit Tests: {target_test_cases}\n"
+                 if target_test_cases
+                 else ""
+                 "Think silently about:\n"
+                 "• Which test seems hardest for the given topic?\n"
+                 "• Where has the student historically struggled?\n"
+                 "• Any edge-cases in the tests’ inputs/outputs?\n\n"
+                 "******** ANSWER FORMAT (STRICT) ********\n"
+                 "<integer>\n"
+                 "****************************************"
+             )
+
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": target_test_cases,
+                         "question_id": str(target_question_id),
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+
+         # Print summary statistics
+         print("\n=== INSTANCE CREATION SUMMARY ===")
+         print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+         print(f"Skipped (no test cases): {skipped_no_tests}")
+         print(f"Available test case question IDs: {len(available_question_ids)}")
+
+         if len(instances) >= 5:
+             print("Sample created instances:")
+             for i, inst in enumerate(instances[:5]):
+                 if inst.extra_data is None:
+                     test_count = 0
+                 else:
+                     test_count = len(inst.extra_data.get("test_cases", []))
+                 print(f" {inst.id}: {test_count} test cases")
+
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}
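For orientation, the `question_unittests` column consumed above is a single flattened string that the loop slices into per-test dicts. A minimal, self-contained sketch of that slicing logic, using a made-up sample string rather than a real `Scenario5_data.csv` row:

```python
# Standalone sketch of the "Unittest" parsing used in CodeInsightsEdgeCaseScenario.get_instances.
# SAMPLE is hypothetical; real values come from the question_unittests column.
from typing import Dict, List

SAMPLE = (
    "Unittest 0:\nInput: vector<int> v{1, 2, 3};\nSTD input: \nOutput: 6\n"
    "Unittest 1:\nInput: vector<int> v{};\nSTD input: \nOutput: 0\n"
)


def parse_unittests(raw: str) -> List[Dict[str, str]]:
    testcases = []
    for chunk in raw.split("Unittest")[1:]:
        chunk = chunk[chunk.find(":") + 1 :]  # drop the "N:" header
        input_idx = chunk.find("Input:")
        std_in_idx = chunk.find("STD input:")
        output_idx = chunk.find("Output:")
        if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
            return []  # parsing failed, mirrors tc_parsing_success = False
        testcases.append(
            {
                "input": chunk[input_idx + 6 : std_in_idx].strip(),
                "std_in": chunk[std_in_idx + 10 : output_idx].strip(),
                "output": chunk[output_idx + 7 :].strip(),
            }
        )
    return testcases


print(parse_unittests(SAMPLE))
# [{'input': 'vector<int> v{1, 2, 3};', 'std_in': '', 'output': '6'},
#  {'input': 'vector<int> v{};', 'std_in': '', 'output': '0'}]
```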
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py (new file)
@@ -0,0 +1,162 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsStudentCodingScenario(Scenario):
+     name = "codeinsights_student_coding"
+     description = "Mimic student C++ style on foundational questions"
+     tags = ["codeinsights", "c++", "student_coding"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+         student_topic = pd.read_csv(
+             "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+         )
+
+         instances = []
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values("timestamp")
+             if len(student_df) < 4:
+                 continue
+             first = student_df.iloc[0]
+             second = student_df.iloc[1]
+             third = student_df.iloc[2]
+             target = student_df.iloc[3]
+
+             # Get test cases for this question
+             question_id = target.get("question_unittest_id", None)
+             question_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 question_test_cases.append(testcase)
+
+             if not tc_parsing_success:
+                 continue
+
+             if len(question_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 question_test_cases = question_test_cases[: self.num_testcases]
+
+             # Get student pass (0 or 1) for the target question
+             student_correctness_pattern = target.get("pass", None)
+             main_part = int(student_correctness_pattern)  # "1111111111"
+             # Convert each character to an int
+             student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+             # Student specific topic performance in previous attempts
+             student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+             topic_performance = student_topic[student_topic["student_id"] == student_id]
+             for _, row in topic_performance.iterrows():
+                 topic = row["topic"]
+                 pass_rate = round(row["pass_rate"], 2)
+                 perfect = round(row["perfect"], 2)
+
+                 student_level_prompt += (
+                     f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                     f"and the rate of passing all unit tests is {perfect}.\n"
+                 )
+
+             prompt = (
+                 "=== Student Profile ===\n"
+                 f"{student_level_prompt}\n"
+                 f"Week: {target['week']}\n"
+                 f"Topic: {target['topic']}\n\n"
+                 "Example 1:\n"
+                 f"Question: {first['question_name']} — {first['question_text']}\n"
+                 "Template:\n"
+                 f"{first['question_template']}\n"
+                 "Your Code:\n"
+                 f"{first['response']}\n\n"
+                 "Example 2:\n"
+                 f"Question: {second['question_name']} — {second['question_text']}\n"
+                 "Template:\n"
+                 f"{second['question_template']}\n"
+                 "Your Code:\n"
+                 f"{second['response']}\n\n"
+                 "Example 3:\n"
+                 f"Question: {third['question_name']} — {third['question_text']}\n"
+                 "Template:\n"
+                 f"{third['question_template']}\n"
+                 "Your Code:\n"
+                 f"{third['response']}\n\n"
+                 "Now, using that same student style, attempt this:\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n"
+                 f"Unit Test Input: {question_test_cases}\n\n"
+                 if question_test_cases
+                 else ""
+                 "Template:\n"
+                 f"{target['question_template']}\n\n"
+                 "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                 "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                 "and make sure the code is compatible with the Unit Test Input. "
+                 "int main() is always declared already so DO NOT produce that initialization on the code. "
+                 "Ensure your code includes any class definition when needed. "
+                 "Return the code in C++ code block format, and nothing else."
+             )
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": question_test_cases,
+                         "question_id": str(question_id) if question_id else None,
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}
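A usage sketch for this new scenario class (not part of the release): the import path follows the file listed above, the constructor and `get_instances` signature are as defined in the diff, and the call needs network access to the Kazchoko/my_dataset files. The `output_path` value here is arbitrary since this scenario reads directly from URLs.

```python
# Hypothetical usage sketch; requires crfm-helm 0.5.7 installed and network access.
from helm.benchmark.scenarios.codeinsights_student_coding_scenario import (
    CodeInsightsStudentCodingScenario,
)

# Default of one unit test per question, matching the constructor above.
scenario = CodeInsightsStudentCodingScenario(num_testcases=1)
instances = scenario.get_instances(output_path="/tmp/codeinsights")

print(f"Built {len(instances)} instances")
if instances:
    # Each Instance carries the prompt plus extra_data consumed by the codeinsights metrics
    # (question template, parsed test cases, question/student ids, pass pattern).
    example = instances[0]
    print(example.id)
    print(sorted(example.extra_data.keys()))
```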
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py (new file)
@@ -0,0 +1,188 @@
+ from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+ import pandas as pd
+ import requests
+
+
+ class CodeInsightsStudentMistakeScenario(Scenario):
+     name = "codeinsights_student_mistake"
+     description = "Mimic how students mistake their C++ codes on foundational questions"
+     tags = ["codeinsights", "c++", "student_mistake"]
+
+     def __init__(self, num_testcases: int = 1):
+         super().__init__()
+         self.num_testcases = num_testcases
+
+     def get_instances(self, output_path: str):
+         df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario3_data.csv")
+         student_topic = pd.read_csv(
+             "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+         )
+
+         instances = []
+         for student_id, student_df in df.groupby("student_id"):
+             student_df = student_df.sort_values(by=["student_id", "question_unittest_id", "timestamp"])
+             if len(student_df) < 4:
+                 continue
+             first = student_df.iloc[0]
+             second = student_df.iloc[1]
+             third = student_df.iloc[2]
+             target = student_df.iloc[3]
+
+             # Get test cases for this question
+             question_id = target.get("question_unittest_id", None)
+             question_test_cases = []
+             tc_parsing_success = True
+
+             for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                 testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                 input_idx = testcase_str.find("Input:")
+                 std_in_idx = testcase_str.find("STD input:")
+                 output_idx = testcase_str.find("Output:")
+                 if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                     tc_parsing_success = False
+                     break
+
+                 testcase = {
+                     "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                     "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                     "output": testcase_str[output_idx + 7 :].strip(),
+                 }
+                 question_test_cases.append(testcase)
+
+             if not tc_parsing_success:
+                 continue
+
+             if len(question_test_cases) < self.num_testcases:
+                 # If not enough test cases, skip this question
+                 continue
+             if self.num_testcases >= 0:
+                 # If more than one test case is requested, only take the first ones
+                 question_test_cases = question_test_cases[: self.num_testcases]
+
+             # Get student pass (0 or 1) for the target question
+             student_correctness_pattern = target.get("pass", None)
+             main_part = int(student_correctness_pattern)  # "1111111111"
+             # Convert each character to an int
+             student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+             # Student specific topic performance in previous attempts
+             student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+             topic_performance = student_topic[student_topic["student_id"] == student_id]
+             for _, row in topic_performance.iterrows():
+                 topic = row["topic"]
+                 pass_rate = round(row["pass_rate"], 2)
+                 perfect = round(row["perfect"], 2)
+
+                 student_level_prompt += (
+                     f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                     f"and the rate of passing all unit tests is {perfect}.\n"
+                 )
+
+             prompt = (
+                 "=== Student Profile ===\n"
+                 f"{student_level_prompt}\n"
+                 "When students submit a code to the platform, it will be tested by number of unit tests, where"
+                 "- Unit test pass rate = proportion of unit tests passed with the code \n"
+                 "- Full pass rate = proportion of code passing all unit tests\n\n"
+                 "=== Past Mistake Examples ===\n"
+                 "Example 1 (Week {first['week']}, Topic: {first['topic']}):\n"
+                 f"Question: {first['question_name']} — {first['question_text']}\n"
+                 "Template:\n"
+                 f"{first['question_template']}\n"
+                 "Student's Response Code with Error:\n"
+                 f"{first['response_mistake']}\n\n"
+                 "Example 2 (Week {second['week']}, Topic: {second['topic']}):\n"
+                 f"Question: {second['question_name']} — {second['question_text']}\n"
+                 "Template:\n"
+                 f"{second['question_template']}\n"
+                 "Student's Response Code with Error:\n"
+                 f"{second['response_mistake']}\n\n"
+                 "Example 3 (Week {third['week']}, Topic: {third['topic']}):\n"
+                 f"Question: {third['question_name']} — {third['question_text']}\n"
+                 "Template:\n"
+                 f"{third['question_template']}\n"
+                 "Student's Response Code with Error:\n"
+                 f"{third['response_mistake']}\n\n"
+                 "=== New Target Problem ===\n"
+                 f"Week: {target['week']}, Topic: {target['topic']}\n"
+                 f"Question: {target['question_name']} — {target['question_text']}\n"
+                 f"Unit Test Input: {question_test_cases}\n\n"
+                 if question_test_cases
+                 else ""
+                 "Template:\n"
+                 f"{target['question_template']}\n\n"
+                 "⚠**Instructions:**\n"
+                 "1. Mimic your own coding style, naming conventions, indentation, and typical error patterns.\n"
+                 "2. Introduce mistake you are likely to make (e.g., off‐by‐one index, wrong initialization, "
+                 "missing edge case).\n"
+                 "3. Do **not** produce a fully correct solution or add unfamiliar optimizations.\n\n"
+                 "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                 "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                 "and make sure the code is compatible with the Unit Test Input. "
+                 "int main() is always declared already so DO NOT produce that initialization on the code. "
+                 "Ensure your code is includes any class definition when needed. "
+                 "Return the code in C++ code block format, and nothing else."
+             )
+
+             print(f"\n=== DEBUG INFO FOR STUDENT {student_id}, QUESTION {question_id} ===")
+             print(f"Test cases loaded: {len(question_test_cases)}")
+             print(f"Student correctness pattern: {student_correctness_list}")
+             print(f"Original pass field: {target.get('pass', 'MISSING')}")
+             print(f"Question template exists: {'question_template' in target}")
+             print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+             # Also add this validation in your UnitTestAlignmentMetric evaluate_generation method:
+             def evaluate_generation(self, adapter_spec, request_state, metric_service, eval_cache_path):
+                 print("\n=== UNIT TEST METRIC DEBUG ===")
+                 print(f"Has extra_data: {hasattr(request_state.instance, 'extra_data')}")
+                 if hasattr(request_state.instance, "extra_data"):
+                     extra_data = request_state.instance.extra_data
+                     print(f"Extra data keys: {list(extra_data.keys())}")
+                     print(f"Test cases: {len(extra_data.get('test_cases', []))}")
+                     print(f"Student pattern: {extra_data.get('student_correctness_pattern', 'MISSING')}")
+
+             instances.append(
+                 Instance(
+                     id=f"{student_id}_{target['question_unittest_id']}",
+                     input=Input(text=prompt),
+                     references=[Reference(output=Output(text=target["response_mistake"]), tags=[CORRECT_TAG])],
+                     extra_data={
+                         "question_template": target["question_template"],
+                         "test_cases": question_test_cases,
+                         "question_id": str(question_id) if question_id else None,
+                         "question_name": target.get("question_name", ""),
+                         "student_id": str(student_id),
+                         "student_correctness_pattern": student_correctness_list,
+                     },
+                     split=VALID_SPLIT,
+                 )
+             )
+         return instances
+
+     def _load_test_cases(self):
+         """
+         Load test cases from external source or return None if not available.
+         This method should be implemented based on where your test cases are stored.
+
+         Expected format:
+         {
+             "question_id": [
+                 {
+                     "unittest": "test_id",
+                     "input": "test input code",
+                     "output": "expected output"
+                 },
+                 ...
+             ],
+             ...
+         }
+         """
+         try:
+             response = requests.get(
+                 "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+             )
+             if response.status_code == 200:
+                 return response.json()
+         except Exception as e:
+             print(f"Failed to load test cases from URL: {e}")
+         return {}
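All three codeinsights scenarios above derive the per-test correctness list by round-tripping the `pass` field through `int()` before splitting it into digits. A small standalone illustration (pattern values made up) of what that conversion yields:

```python
# Standalone illustration of the pass-pattern conversion used above; example values are made up.
def to_correctness_list(pass_field) -> list:
    main_part = int(pass_field)  # e.g. "1111111111" -> 1111111111
    return [int(ch) for ch in str(main_part)]


print(to_correctness_list("1111111111"))  # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
print(to_correctness_list(10110))         # [1, 0, 1, 1, 0]
print(to_correctness_list("0111"))        # [1, 1, 1] -- int() drops a leading zero
```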
helm/benchmark/scenarios/exams_multilingual_scenario.py (new file)
@@ -0,0 +1,115 @@
+ import os
+ from typing import Dict, List
+
+ import datasets
+
+ from helm.common.general import ensure_directory_exists
+ from helm.benchmark.scenarios.scenario import (
+     VALID_SPLIT,
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+ from helm.common.hierarchical_logger import hwarn
+
+
+ class EXAMSMultilingualScenario(Scenario):
+     """EXAMS: A Multi-subject High School Examinations Dataset
+
+     EXAMS is a benchmark dataset for multilingual and cross-lingual
+     question answering from high school examinations. It consists of
+     more than 24,000 high-quality high school exam questions in 16
+     languages, covering 8 language families and 24 school subjects
+     from Natural Sciences and Social Sciences, among others.
+
+     - https://huggingface.co/datasets/mhardalov/exams
+     - https://aclanthology.org/2020.emnlp-main.438/
+
+     Note: Some dataset rows have the value '@' in the `answerKey` column.
+     These rows will be ignored.
+
+     ```
+     @inproceedings{hardalov-etal-2020-exams,
+         title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+         author = "Hardalov, Momchil and
+           Mihaylov, Todor and
+           Zlatkova, Dimitrina and
+           Dinkov, Yoan and
+           Koychev, Ivan and
+           Nakov, Preslav",
+         editor = "Webber, Bonnie and
+           Cohn, Trevor and
+           He, Yulan and
+           Liu, Yang",
+         booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+         month = nov,
+         year = "2020",
+         address = "Online",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2020.emnlp-main.438/",
+         doi = "10.18653/v1/2020.emnlp-main.438",
+         pages = "5427--5444",
+         abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+     }```
+     """  # noqa: E501
+
+     name = "exams_multilingual"
+     description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+     tags = ["knowledge", "multiple_choice"]
+
+     CHOICES = ["A", "B", "C", "D", "E"]
+     HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+     def __init__(self, language: str, subject: str):
+         super().__init__()
+         self.language = language
+         self.subject = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         cache_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(cache_dir)
+         dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+             "mhardalov/exams",
+             "multilingual",
+             revision="4ff10804abb3341f8815cacd778181177bba7edd",
+             cache_dir=cache_dir,
+         )
+
+         # Read all instances
+         instances: List[Instance] = []
+         for split_name, dataset in dataset_splits.items():
+             assert isinstance(dataset, datasets.Dataset)
+             for row in dataset:
+                 question = row["question"]
+                 question_info = row["info"]
+                 if self.subject != "all" and question_info["subject"] != self.subject:
+                     continue
+                 if self.language != "all" and question_info["language"] != self.language:
+                     continue
+                 input = Input(text=question["stem"])
+                 references: List[Reference] = []
+                 if row["answerKey"] not in self.CHOICES:
+                     hwarn(f"Invalid value in answerKey column in row: {row}")
+                     continue
+                 correct_choice_index = ord(row["answerKey"]) - ord("A")
+                 for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                     references.append(
+                         Reference(
+                             output=Output(text=choice_text),
+                             tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                         )
+                     )
+                 instance = Instance(
+                     id=row["id"],
+                     input=input,
+                     references=references,
+                     split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                 )
+                 instances.append(instance)
+
+         return instances
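The reference construction in this scenario maps the `answerKey` letter to a 0-based choice index via `ord()` and tags only the matching choice. A minimal sketch of that mapping with a hypothetical row (not taken from the EXAMS dataset):

```python
# Minimal sketch of the answerKey -> reference-tag mapping used above; the row is hypothetical.
CHOICES = ["A", "B", "C", "D", "E"]
CORRECT_TAG = "correct"  # stand-in for helm.benchmark.scenarios.scenario.CORRECT_TAG

row = {
    "answerKey": "C",
    "question": {"choices": {"text": ["Mercury", "Venus", "Earth", "Mars"]}},
}

if row["answerKey"] in CHOICES:
    correct_choice_index = ord(row["answerKey"]) - ord("A")  # "C" -> 2
    references = [
        (choice_text, [CORRECT_TAG] if i == correct_choice_index else [])
        for i, choice_text in enumerate(row["question"]["choices"]["text"])
    ]
    print(references)
    # [('Mercury', []), ('Venus', []), ('Earth', ['correct']), ('Mars', [])]
```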