crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py
@@ -0,0 +1,192 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsEdgeCaseScenario(Scenario):
+    name = "codeinsights_edge_case"
+    description = "Evaluate alignment in edge case failure between LLM-generated code and student code"
+    tags = ["codeinsights", "c++", "edge_case"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario5_data.csv")
+
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        # Load test cases (unit tests)
+        test_cases = self._load_test_cases()
+
+        # Get available question IDs with test cases
+        available_question_ids = set()
+        if test_cases:
+            available_question_ids = set(test_cases.keys())
+            print(f"Loaded test cases for {len(available_question_ids)} questions")
+        else:
+            print("WARNING: No test cases loaded!")
+            return []
+
+        instances = []
+        skipped_no_tests = 0
+        skipped_insufficient_data = 0
+
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            target = student_df.iloc[0]
+
+            # Check if target question has test cases BEFORE processing
+            target_question_id = target.get("question_unittest_id", None)
+            if not target_question_id or str(target_question_id) not in available_question_ids:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: No test cases available")
+                continue
+
+            # Get test cases for this question (we know they exist now)
+            target_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                target_test_cases.append(testcase)
+
+            # Verify test cases are not empty
+            if not tc_parsing_success:
+                skipped_no_tests += 1
+                print(f"SKIPPING Student {student_id}, Question {target_question_id}: Empty test cases")
+                continue
+
+            if len(target_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                target_test_cases = target_test_cases[: self.num_testcases]
+
+            # Get student pass pattern for the target question
+            student_correctness_pattern = target.get("pass", None)
+            if student_correctness_pattern is not None:
+                main_part = int(student_correctness_pattern)
+                # Convert each character to an int
+                student_correctness_list = [int(ch) for ch in str(main_part)]
+            else:
+                student_correctness_list = []
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            print(f"\n=== ACCEPTED INSTANCE: Student {student_id}, Question {target_question_id} ===")
+            print(f"Test cases loaded: {len(target_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            prompt = (
+                "You are analyzing a student’s likely mistakes on an upcoming programming problem.\n"
+                "Your task: **predict exactly ONE unit-test index (0-based) that the student is most likely to fail.**\n"  # noqa: E501
+                "Return *only* that integer. No explanation.\n\n"
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                "For the given programming question, identify which unit test the student would fail considering "
+                "their past performance, as well as consideration of unit test difficulty.\n"
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Tests: {target_test_cases}\n"
+                if target_test_cases
+                else ""
+                "Think silently about:\n"
+                "• Which test seems hardest for the given topic?\n"
+                "• Where has the student historically struggled?\n"
+                "• Any edge-cases in the tests’ inputs/outputs?\n\n"
+                "******** ANSWER FORMAT (STRICT) ********\n"
+                "<integer>\n"
+                "****************************************"
+            )
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": target_test_cases,
+                        "question_id": str(target_question_id),
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+
+        # Print summary statistics
+        print("\n=== INSTANCE CREATION SUMMARY ===")
+        print(f"Skipped (insufficient data): {skipped_insufficient_data}")
+        print(f"Skipped (no test cases): {skipped_no_tests}")
+        print(f"Available test case question IDs: {len(available_question_ids)}")
+
+        if len(instances) >= 5:
+            print("Sample created instances:")
+            for i, inst in enumerate(instances[:5]):
+                if inst.extra_data is None:
+                    test_count = 0
+                else:
+                    test_count = len(inst.extra_data.get("test_cases", []))
+                print(f" {inst.id}: {test_count} test cases")
+
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
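For context on the parsing loop in `get_instances` above: it slices each unit-test block on the literal markers `Input:`, `STD input:`, and `Output:` (the offsets 6, 10, and 7 are simply the lengths of those markers). A minimal, self-contained sketch with a made-up `question_unittests` value (the concrete test content is hypothetical, not from the dataset) shows the dict it produces:

    # Hypothetical example of the raw "question_unittests" string the loop above expects.
    raw = (
        "Unittest 0:\n"
        "Input: cout << add(1, 2);\n"
        "STD input: \n"
        "Output: 3\n"
    )
    for testcase_str in raw.split("Unittest")[1:]:
        testcase_str = testcase_str[testcase_str.find(":") + 1 :]
        input_idx = testcase_str.find("Input:")
        std_in_idx = testcase_str.find("STD input:")
        output_idx = testcase_str.find("Output:")
        print(
            {
                "input": testcase_str[input_idx + 6 : std_in_idx].strip(),  # "cout << add(1, 2);"
                "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),  # ""
                "output": testcase_str[output_idx + 7 :].strip(),  # "3"
            }
        )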
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py
@@ -0,0 +1,162 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsStudentCodingScenario(Scenario):
+    name = "codeinsights_student_coding"
+    description = "Mimic student C++ style on foundational questions"
+    tags = ["codeinsights", "c++", "student_coding"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario1_2_data.csv")
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        instances = []
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values("timestamp")
+            if len(student_df) < 4:
+                continue
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Get test cases for this question
+            question_id = target.get("question_unittest_id", None)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass (0 or 1) for the target question
+            student_correctness_pattern = target.get("pass", None)
+            main_part = int(student_correctness_pattern)  # "1111111111"
+            # Convert each character to an int
+            student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            prompt = (
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                f"Week: {target['week']}\n"
+                f"Topic: {target['topic']}\n\n"
+                "Example 1:\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Your Code:\n"
+                f"{first['response']}\n\n"
+                "Example 2:\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Your Code:\n"
+                f"{second['response']}\n\n"
+                "Example 3:\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Your Code:\n"
+                f"{third['response']}\n\n"
+                "Now, using that same student style, attempt this:\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code includes any class definition when needed. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
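The scenarios above are ordinary HELM `Scenario` subclasses, so they can also be exercised outside a full benchmark run. A minimal sketch (the output path is illustrative, and the call needs network access to the Hugging Face URLs hard-coded above):

    from helm.benchmark.scenarios.codeinsights_student_coding_scenario import CodeInsightsStudentCodingScenario

    # Keep only the first unit test per question, matching the constructor default shown above.
    scenario = CodeInsightsStudentCodingScenario(num_testcases=1)
    instances = scenario.get_instances(output_path="benchmark_output/scratch")  # hypothetical path
    print(f"{len(instances)} instances; first id: {instances[0].id if instances else None}")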
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py
@@ -0,0 +1,188 @@
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, VALID_SPLIT, CORRECT_TAG
+import pandas as pd
+import requests
+
+
+class CodeInsightsStudentMistakeScenario(Scenario):
+    name = "codeinsights_student_mistake"
+    description = "Mimic how students mistake their C++ codes on foundational questions"
+    tags = ["codeinsights", "c++", "student_mistake"]
+
+    def __init__(self, num_testcases: int = 1):
+        super().__init__()
+        self.num_testcases = num_testcases
+
+    def get_instances(self, output_path: str):
+        df = pd.read_csv("https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/Scenario3_data.csv")
+        student_topic = pd.read_csv(
+            "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/student_performace_by_topic.csv"
+        )
+
+        instances = []
+        for student_id, student_df in df.groupby("student_id"):
+            student_df = student_df.sort_values(by=["student_id", "question_unittest_id", "timestamp"])
+            if len(student_df) < 4:
+                continue
+            first = student_df.iloc[0]
+            second = student_df.iloc[1]
+            third = student_df.iloc[2]
+            target = student_df.iloc[3]
+
+            # Get test cases for this question
+            question_id = target.get("question_unittest_id", None)
+            question_test_cases = []
+            tc_parsing_success = True
+
+            for testcase_str in target["question_unittests"].split("Unittest")[1:]:
+                testcase_str = testcase_str[testcase_str.find(":") + 1 :]
+                input_idx = testcase_str.find("Input:")
+                std_in_idx = testcase_str.find("STD input:")
+                output_idx = testcase_str.find("Output:")
+                if input_idx == -1 or std_in_idx == -1 or output_idx == -1:
+                    tc_parsing_success = False
+                    break
+
+                testcase = {
+                    "input": testcase_str[input_idx + 6 : std_in_idx].strip(),
+                    "std_in": testcase_str[std_in_idx + 10 : output_idx].strip(),
+                    "output": testcase_str[output_idx + 7 :].strip(),
+                }
+                question_test_cases.append(testcase)
+
+            if not tc_parsing_success:
+                continue
+
+            if len(question_test_cases) < self.num_testcases:
+                # If not enough test cases, skip this question
+                continue
+            if self.num_testcases >= 0:
+                # If more than one test case is requested, only take the first ones
+                question_test_cases = question_test_cases[: self.num_testcases]
+
+            # Get student pass (0 or 1) for the target question
+            student_correctness_pattern = target.get("pass", None)
+            main_part = int(student_correctness_pattern)  # "1111111111"
+            # Convert each character to an int
+            student_correctness_list = [int(ch) for ch in str(main_part)]  # [1,1,1,1,1,1,1,1,1,1]
+
+            # Student specific topic performance in previous attempts
+            student_level_prompt = f"Student {student_id} has the following performance across topics:\n"
+            topic_performance = student_topic[student_topic["student_id"] == student_id]
+            for _, row in topic_performance.iterrows():
+                topic = row["topic"]
+                pass_rate = round(row["pass_rate"], 2)
+                perfect = round(row["perfect"], 2)
+
+                student_level_prompt += (
+                    f"- For topic '{topic}', the unit test pass rate is {pass_rate}, "
+                    f"and the rate of passing all unit tests is {perfect}.\n"
+                )
+
+            prompt = (
+                "=== Student Profile ===\n"
+                f"{student_level_prompt}\n"
+                "When students submit a code to the platform, it will be tested by number of unit tests, where"
+                "- Unit test pass rate = proportion of unit tests passed with the code \n"
+                "- Full pass rate = proportion of code passing all unit tests\n\n"
+                "=== Past Mistake Examples ===\n"
+                "Example 1 (Week {first['week']}, Topic: {first['topic']}):\n"
+                f"Question: {first['question_name']} — {first['question_text']}\n"
+                "Template:\n"
+                f"{first['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{first['response_mistake']}\n\n"
+                "Example 2 (Week {second['week']}, Topic: {second['topic']}):\n"
+                f"Question: {second['question_name']} — {second['question_text']}\n"
+                "Template:\n"
+                f"{second['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{second['response_mistake']}\n\n"
+                "Example 3 (Week {third['week']}, Topic: {third['topic']}):\n"
+                f"Question: {third['question_name']} — {third['question_text']}\n"
+                "Template:\n"
+                f"{third['question_template']}\n"
+                "Student's Response Code with Error:\n"
+                f"{third['response_mistake']}\n\n"
+                "=== New Target Problem ===\n"
+                f"Week: {target['week']}, Topic: {target['topic']}\n"
+                f"Question: {target['question_name']} — {target['question_text']}\n"
+                f"Unit Test Input: {question_test_cases}\n\n"
+                if question_test_cases
+                else ""
+                "Template:\n"
+                f"{target['question_template']}\n\n"
+                "⚠**Instructions:**\n"
+                "1. Mimic your own coding style, naming conventions, indentation, and typical error patterns.\n"
+                "2. Introduce mistake you are likely to make (e.g., off‐by‐one index, wrong initialization, "
+                "missing edge case).\n"
+                "3. Do **not** produce a fully correct solution or add unfamiliar optimizations.\n\n"
+                "Provide ONLY your C++ implementation following the given template, where the answer will replace the {{ STUDENT_ANSWER }} block in the template. "
+                "DO NOT reproduce the template part as the generated code would be inserted to the template, "
+                "and make sure the code is compatible with the Unit Test Input. "
+                "int main() is always declared already so DO NOT produce that initialization on the code. "
+                "Ensure your code is includes any class definition when needed. "
+                "Return the code in C++ code block format, and nothing else."
+            )
+
+            print(f"\n=== DEBUG INFO FOR STUDENT {student_id}, QUESTION {question_id} ===")
+            print(f"Test cases loaded: {len(question_test_cases)}")
+            print(f"Student correctness pattern: {student_correctness_list}")
+            print(f"Original pass field: {target.get('pass', 'MISSING')}")
+            print(f"Question template exists: {'question_template' in target}")
+            print(f"Question name: {target.get('question_name', 'MISSING')}")
+
+            # Also add this validation in your UnitTestAlignmentMetric evaluate_generation method:
+            def evaluate_generation(self, adapter_spec, request_state, metric_service, eval_cache_path):
+                print("\n=== UNIT TEST METRIC DEBUG ===")
+                print(f"Has extra_data: {hasattr(request_state.instance, 'extra_data')}")
+                if hasattr(request_state.instance, "extra_data"):
+                    extra_data = request_state.instance.extra_data
+                    print(f"Extra data keys: {list(extra_data.keys())}")
+                    print(f"Test cases: {len(extra_data.get('test_cases', []))}")
+                    print(f"Student pattern: {extra_data.get('student_correctness_pattern', 'MISSING')}")
+
+            instances.append(
+                Instance(
+                    id=f"{student_id}_{target['question_unittest_id']}",
+                    input=Input(text=prompt),
+                    references=[Reference(output=Output(text=target["response_mistake"]), tags=[CORRECT_TAG])],
+                    extra_data={
+                        "question_template": target["question_template"],
+                        "test_cases": question_test_cases,
+                        "question_id": str(question_id) if question_id else None,
+                        "question_name": target.get("question_name", ""),
+                        "student_id": str(student_id),
+                        "student_correctness_pattern": student_correctness_list,
+                    },
+                    split=VALID_SPLIT,
+                )
+            )
+        return instances
+
+    def _load_test_cases(self):
+        """
+        Load test cases from external source or return None if not available.
+        This method should be implemented based on where your test cases are stored.
+
+        Expected format:
+        {
+            "question_id": [
+                {
+                    "unittest": "test_id",
+                    "input": "test input code",
+                    "output": "expected output"
+                },
+                ...
+            ],
+            ...
+        }
+        """
+        try:
+            response = requests.get(
+                "https://huggingface.co/datasets/Kazchoko/my_dataset/resolve/main/test_cases_by_qid.json"
+            )
+            if response.status_code == 200:
+                return response.json()
+        except Exception as e:
+            print(f"Failed to load test cases from URL: {e}")
+        return {}
helm/benchmark/scenarios/exams_multilingual_scenario.py
@@ -0,0 +1,115 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class EXAMSMultilingualScenario(Scenario):
+    """EXAMS: A Multi-subject High School Examinations Dataset
+
+    EXAMS is a benchmark dataset for multilingual and cross-lingual
+    question answering from high school examinations. It consists of
+    more than 24,000 high-quality high school exam questions in 16
+    languages, covering 8 language families and 24 school subjects
+    from Natural Sciences and Social Sciences, among others.
+
+    - https://huggingface.co/datasets/mhardalov/exams
+    - https://aclanthology.org/2020.emnlp-main.438/
+
+    Note: Some dataset rows have the value '@' in the `answerKey` column.
+    These rows will be ignored.
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "exams_multilingual"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D", "E"]
+    HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+    def __init__(self, language: str, subject: str):
+        super().__init__()
+        self.language = language
+        self.subject = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "mhardalov/exams",
+            "multilingual",
+            revision="4ff10804abb3341f8815cacd778181177bba7edd",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row in dataset:
+                question = row["question"]
+                question_info = row["info"]
+                if self.subject != "all" and question_info["subject"] != self.subject:
+                    continue
+                if self.language != "all" and question_info["language"] != self.language:
+                    continue
+                input = Input(text=question["stem"])
+                references: List[Reference] = []
+                if row["answerKey"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answerKey column in row: {row}")
+                    continue
+                correct_choice_index = ord(row["answerKey"]) - ord("A")
+                for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                    references.append(
+                        Reference(
+                            output=Output(text=choice_text),
+                            tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
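As with the other new scenarios, `EXAMSMultilingualScenario` can be exercised directly. A minimal sketch (the language value and output path are illustrative; per the code above, passing "all" disables the corresponding filter, and `get_instances` downloads the dataset via `datasets`):

    from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario

    # Keep every subject but restrict to one language (hypothetical choice).
    scenario = EXAMSMultilingualScenario(language="Bulgarian", subject="all")
    instances = scenario.get_instances(output_path="benchmark_output/scratch/exams")
    print(len(instances), instances[0].split if instances else None)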