crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py
@@ -0,0 +1,186 @@
from typing import List, Tuple, Dict, Any
import time

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
    CodeInsightsFunctionalCorrectnessMetric,
    CPPEvaluator,
)


class CodeInsightsCodeEfficiencyMetric(CodeInsightsFunctionalCorrectnessMetric):
    """
    Comprehensive metric combining functional correctness and runtime efficiency evaluation.

    This metric first evaluates functional correctness and then measures runtime efficiency
    alignment between LLM-generated code and student reference code when both are correct.
    """

    def __init__(
        self,
        num_runtime_runs: int = 5,
        timeout_seconds: int = 10,
    ):
        """
        Initializes the CodeInsightsFunctionalCorrectnessMetric.

        Args:
            timeout (int): Timeout for each test case execution.
        """
        super().__init__()
        self.num_runtime_runs = num_runtime_runs
        self.timeout_seconds = timeout_seconds

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Evaluate LLM-generated code by running unit tests and computing pass rate.

        Returns:
            List of Stat objects containing the functional correctness score
        """
        print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
        print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")

        # Get the generated code from the request state
        if not request_state.result or not request_state.result.completions:
            print("ERROR: No output generated")
            return self._create_failure_stats("No output generated")

        generated_code = request_state.result.completions[0].text.strip()
        generated_code = self._extract_student_code(generated_code)
        print(f"Generated code length: {len(generated_code)}")
        print(f"Generated code preview: {generated_code[:200]}...")

        # Get the student code from the instance references
        student_code = request_state.instance.references[0].output.text.strip()
        print(f"Student code length: {len(student_code)}")

        # Get test cases from instance extra_data
        if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
            print("ERROR: No extra_data available")
            print(f"Instance attributes: {dir(request_state.instance)}")
            return self._create_failure_stats("No test data available")

        extra_data = request_state.instance.extra_data
        print(f"Extra data keys: {list(extra_data.keys())}")

        test_cases = extra_data.get("test_cases", [])
        question_template = extra_data.get("question_template", "")
        question_name = extra_data.get("question_name", "UNKNOWN")

        print(f"Question name: {question_name}")
        print(f"Number of test cases: {len(test_cases)}")
        print(f"Template length: {len(question_template)}")

        if not test_cases:
            print("ERROR: No test cases available")
            return self._create_failure_stats("No test cases available")

        print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")

        # Run unit tests and calculate pass rate
        evaluator = CPPEvaluator(
            question_template,
            test_cases,
            timeout=self.timeout_seconds,
            max_workers=1,
        )

        llm_output, llm_avg_runtime = self._timed_run(evaluator, generated_code, self.num_runtime_runs)
        stu_output, stu_avg_runtime = self._timed_run(evaluator, student_code, self.num_runtime_runs)

        # Compute functional correctness score
        if not llm_output or "score" not in llm_output:
            stats = [Stat(MetricName("functional_correctness")).add(0.0)]
        else:
            stats = [Stat(MetricName("functional_correctness")).add(llm_output["score"])]

        # Calculate runtime metrics if we have data for both solutions
        if llm_avg_runtime > 0 and stu_avg_runtime > 0:
            # Runtime ratio (LLM / Student) - values > 1 mean LLM is slower
            runtime_ratio = llm_avg_runtime / stu_avg_runtime if stu_avg_runtime > 0 else float("inf")

            # Efficiency alignment score (closer to 1.0 is better alignment)
            # Use reciprocal if LLM is faster to normalize the scale
            if runtime_ratio > 1:
                efficiency_alignment = 1.0 / runtime_ratio
            else:
                efficiency_alignment = runtime_ratio

            print(f"Runtime ratio (LLM/Student): {runtime_ratio:.4f}")
            print(f"Efficiency alignment score: {efficiency_alignment:.4f}")

            stats.extend(
                [
                    Stat(MetricName("runtime_efficiency_ratio")).add(runtime_ratio),
                    Stat(MetricName("efficiency_alignment_score")).add(efficiency_alignment),
                ]
            )

        # Handle cases where only one solution has runtime data
        elif llm_avg_runtime > 0 and stu_avg_runtime <= 0:
            print("Only LLM runtime available - student solution failed to run")
            stats.extend(
                [
                    Stat(MetricName("runtime_efficiency_ratio")).add(float("inf")),  # LLM runs, student doesn't
                    Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
                ]
            )

        elif llm_avg_runtime <= 0 and stu_avg_runtime > 0:
            print("Only student runtime available - LLM solution failed to run")
            stats.extend(
                [
                    Stat(MetricName("runtime_efficiency_ratio")).add(0.0),  # Student runs, LLM doesn't
                    Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
                ]
            )

        else:
            # Neither solution has runtime data
            print("Runtime measurement failed for both solutions")
            stats.extend(
                [
                    Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
                    Stat(MetricName("efficiency_alignment_score")).add(0.0),
                ]
            )

        return stats

    def _timed_run(self, evaluator: CPPEvaluator, code: str, num_runtime_runs: int = 1) -> Tuple[Dict[str, Any], float]:
        list_runtimes: List[float] = []
        last_output: Dict[str, Any] = {}

        for _ in range(num_runtime_runs):
            start_time = time.perf_counter()
            output = evaluator.evaluate(code)
            passed = sum(output.get("testcases", []))

            if passed > 0:
                elapsed = time.perf_counter() - start_time
                list_runtimes.append(elapsed / passed)
            last_output = output
            # if passed == 0, we simply skip recording this run

        avg_runtime = sum(list_runtimes) / len(list_runtimes) if list_runtimes else 0.0
        return last_output, avg_runtime

    def _create_failure_stats(self, error_message: str) -> List[Stat]:
        """Create default statistics for failure cases."""
        print(f"RUNTIME EFFICIENCY METRIC FAILURE: {error_message}")
        return [
            Stat(MetricName("functional_correctness")).add(0.0),
            Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
            Stat(MetricName("efficiency_alignment_score")).add(0.0),
        ]
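The release also adds `codeinsights_metric_specs.py` (+51 lines) and `codeinsights_run_specs.py` (+192 lines), whose contents are not rendered in this view. For orientation only: a metric like the one above is normally attached to a run spec through HELM's generic `MetricSpec`. The sketch below assumes that pattern; the helper name and defaults are hypothetical, not the actual contents of those new files.

```python
# Hypothetical sketch (not taken from this release): wiring the new efficiency
# metric into a run spec via HELM's MetricSpec. The class path and constructor
# arguments come from the diff above; the helper name is invented.
from typing import List

from helm.benchmark.metrics.metric import MetricSpec


def get_codeinsights_code_efficiency_metric_specs(
    num_runtime_runs: int = 5,
    timeout_seconds: int = 10,
) -> List[MetricSpec]:
    return [
        MetricSpec(
            class_name=(
                "helm.benchmark.metrics.codeinsights_code_efficiency_metrics"
                ".CodeInsightsCodeEfficiencyMetric"
            ),
            args={"num_runtime_runs": num_runtime_runs, "timeout_seconds": timeout_seconds},
        )
    ]
```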
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py
@@ -0,0 +1,477 @@
from typing import Dict, List
import pandas as pd
import re
import os
import subprocess
import tempfile
import clang.cindex
from clang.cindex import CursorKind
from Levenshtein import ratio as levenshtein_distance_ratio

try:
    import torch
    import torch.nn.functional as F
    from transformers import RobertaTokenizer, RobertaModel

    CODEBERT_AVAILABLE = True
except ImportError:
    CODEBERT_AVAILABLE = False
    print("Warning: CodeBERT dependencies not available. Install with: pip install torch transformers")

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat
from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
    CodeInsightsFunctionalCorrectnessMetric,
    CPPEvaluator,
)


def _cpp_to_asm(src: str, compiler: str = "g++") -> str:
    """Return the assembly text for `src`, or '' if the compile fails."""

    with tempfile.NamedTemporaryFile(mode="w", suffix=".cpp", delete=False) as f:
        f.write(src)
        cpp_path = f.name
    asm_path = cpp_path.replace(".cpp", ".s")
    try:
        subprocess.run(
            [compiler, "-std=c++17", "-S", "-o", asm_path, cpp_path],
            check=True,
            capture_output=True,
            text=True,
            timeout=30,
        )
        temp_file_name = os.path.basename(asm_path)
        with open(asm_path, "r") as fh:
            asm_code = fh.read()
        asm_code = asm_code.replace(temp_file_name, "asm_output.cpp")  # Normalize file name in output
        return asm_code

    except subprocess.CalledProcessError as e:
        print("⚠️ Assembly compilation failed:", e.stderr[:200])
        return ""
    finally:
        try:
            os.unlink(cpp_path)
            if os.path.exists(asm_path):
                os.unlink(asm_path)
        except Exception:
            pass


class ASTAnalyzer:
    """Class for calculating AST edit distances between two C++ code snippets using libclang."""

    def __init__(self, clang_lib_path: str = ""):
        """
        If libclang isn't on your LD_LIBRARY_PATH, pass its full path here.
        e.g. '/usr/lib/llvm-14/lib/libclang.so'
        """
        if clang_lib_path:
            clang.cindex.Config.set_library_file(clang_lib_path)
        self.index = clang.cindex.Index.create()

    def calculate_ast_distance(self, code1: str, code2: str) -> float:
        """
        Calculate normalized AST edit distance between two C++ code snippets.
        Returns a float in [0,1], where 0 means identical ASTs and 1 means completely different
        (or a parse failure).
        """
        try:
            tu1 = self.index.parse(path="code1.cpp", args=["-std=c++17"], unsaved_files=[("code1.cpp", code1)])
            tu2 = self.index.parse(path="code2.cpp", args=["-std=c++17"], unsaved_files=[("code2.cpp", code2)])

            nodes1: List[str] = self._extract_ast_features(tu1.cursor)
            nodes2: List[str] = self._extract_ast_features(tu2.cursor)

            return levenshtein_distance_ratio(nodes1, nodes2)

        except Exception:
            # any parse error or clang error → max distance
            return 1.0

    def _extract_ast_features(self, node: clang.cindex.Cursor) -> List[str]:
        """Recursively walk Clang AST, appending feature strings to feats."""
        feats: List[str] = []
        # record the node kind
        feats.append(node.kind.name)

        # some node-specific details
        if node.kind == CursorKind.FUNCTION_DECL:
            feats.append(f"Function:{node.spelling}")
        elif node.kind == CursorKind.DECL_REF_EXPR:
            feats.append(f"Ref:{node.spelling}")
        elif node.kind in (
            CursorKind.INTEGER_LITERAL,
            CursorKind.FLOATING_LITERAL,
            CursorKind.STRING_LITERAL,
            CursorKind.CHARACTER_LITERAL,
        ):
            # get literal token text
            tokens = list(node.get_tokens())
            if tokens:
                feats.append(f"Literal:{tokens[0].spelling}")
        else:
            # for other nodes, just use the spelling if available
            if node.spelling:
                feats.append(f"Other:{node.spelling}")

        # recurse
        for child in node.get_children():
            feats = feats + self._extract_ast_features(child)

        return feats


class CodeBERTAnalyzer:
    """Utility class for calculating semantic code similarity using CodeBERT."""

    def __init__(self):
        if not CODEBERT_AVAILABLE:
            raise ImportError("CodeBERT dependencies not available. Install with: pip install torch transformers")

        # Initialize CodeBERT model and tokenizer
        self.model_name = "microsoft/codebert-base"
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
        self.model = RobertaModel.from_pretrained(self.model_name)
        self.model.eval()  # Set to evaluation mode

    def strip_code_fence(self, code: str) -> str:
        """Remove code fence markers from code strings."""
        # Remove ```python, ```cpp, ``` etc.
        code = re.sub(r"^```\w*\n", "", code, flags=re.MULTILINE)
        code = re.sub(r"\n```$", "", code, flags=re.MULTILINE)
        return code.strip()

    def get_code_embedding(self, code: str, max_length: int = 512) -> torch.Tensor:
        """Compute fixed-size embedding vector for code using CodeBERT."""
        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")

        attention_mask = inputs.attention_mask

        with torch.no_grad():
            outputs = self.model(**inputs)
            last_hidden = outputs.last_hidden_state

        # Apply attention mask and average valid token embeddings
        mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        summed = torch.sum(last_hidden * mask, dim=1)
        counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        embedding = summed / counts

        return embedding.squeeze(0)

    def calculate_embedding_similarity(self, code1: str, code2: str) -> float:
        """Calculate cosine similarity between code embeddings."""
        clean_code1 = self.strip_code_fence(code1)
        clean_code2 = self.strip_code_fence(code2)

        emb1 = self.get_code_embedding(clean_code1)
        emb2 = self.get_code_embedding(clean_code2)

        cosine_sim = F.cosine_similarity(emb1, emb2, dim=0).item()
        return cosine_sim


class CodeInsightsCodeEvaluationMetric(Metric):
    """Metric for evaluating code generation quality using AST analysis and CodeBERT similarity."""

    def __init__(self, use_codebert: bool = True):
        self.ast_analyzer = ASTAnalyzer()
        self.use_codebert = use_codebert and CODEBERT_AVAILABLE

        if self.use_codebert:
            try:
                self.codebert_analyzer = CodeBERTAnalyzer()
            except Exception as e:
                print(f"Warning: Failed to initialize CodeBERT analyzer: {e}")
                self.use_codebert = False

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Evaluate a single generated code snippet."""
        stats = []

        # Get the generated code from the request state
        if not request_state.result or not request_state.result.completions:
            return self._create_default_stats(0.5)

        generated_code = request_state.result.completions[0].text.strip()
        generated_code = self._extract_student_code(generated_code)

        # Get the ground truth from the instance references
        if not request_state.instance.references:
            return self._create_default_stats(0.5)

        ground_truth = request_state.instance.references[0].output.text.strip()

        # Calculate AST distance
        if not generated_code or "Error:" in generated_code:
            ast_distance = 1.0
        else:
            try:
                ast_distance = self.ast_analyzer.calculate_ast_distance(generated_code, ground_truth)
            except Exception:
                ast_distance = 1.0

        # Create AST-based statistics
        stats.extend(self._create_ast_stats(ast_distance))

        # Calculate assembly distance
        gen_asm = _cpp_to_asm(generated_code)
        truth_asm = _cpp_to_asm(ground_truth)
        if gen_asm == "" or truth_asm == "":
            asm_distance = 1.0
        else:
            asm_distance = levenshtein_distance_ratio(gen_asm, truth_asm)

        # Create assembly-based statistics
        stats.extend(self._create_asm_stats(asm_distance))

        # Calculate CodeBERT similarity if available
        if self.use_codebert:
            try:
                codebert_similarity = self.codebert_analyzer.calculate_embedding_similarity(
                    generated_code, ground_truth
                )
                stats.extend(self._create_codebert_stats(codebert_similarity))
            except Exception as e:
                print(f"Warning: CodeBERT similarity calculation failed: {e}")
                # Add default CodeBERT stats for failed calculations
                stats.extend(self._create_codebert_stats(0.0))

        return stats

    def _extract_student_code(self, model_code: str) -> str:
        """
        Extracts clean C++ code from model output:
        - Trims preambles
        - Removes student's main()
        """

        code_blocks = re.findall(r"```(?:c\+\+)?\n(.*?)```", model_code, flags=re.DOTALL)
        if code_blocks:
            model_code = code_blocks[0].strip()  # Use the first code block
            print("[Markdown extraction] Used fenced code blocks.")

        # Post-processing
        lines = model_code.strip().splitlines()
        start_keywords = ("#include", "using namespace")
        for i, line in enumerate(lines):
            if any(line.strip().startswith(k) for k in start_keywords):
                lines[i] = ""

        code = "\n".join(lines).strip()
        if "int main" in code:
            code = code.split("int main")[0].strip()

        # --- Final touch ---
        if "print(" in code and "void print()" not in code and "print()" not in code:
            print("⚠️ WARNING: `print()` is called in test input but not defined.")

        print(f"[Final extracted code length] {len(code)}")
        print(f"[Code preview]\n{code[:300]}...\n")
        return code

    def _create_default_stats(self, distance: float) -> List[Stat]:
        """Create default statistics for error cases."""
        stats = self._create_ast_stats(distance)
        if self.use_codebert:
            stats.extend(self._create_codebert_stats(0.0))
        return stats

    def _create_ast_stats(self, ast_distance: float) -> List[Stat]:
        """Create AST-based statistics."""
        return [Stat(MetricName("ast_distance")).add(ast_distance)]

    def _create_codebert_stats(self, codebert_similarity: float) -> List[Stat]:
        """Create CodeBERT-based statistics."""
        return [Stat(MetricName("codebert_similarity")).add(codebert_similarity)]

    def _create_asm_stats(self, asm_distance: float) -> List[Stat]:
        """Create assembly-based statistics."""
        return [Stat(MetricName("asm_distance")).add(asm_distance)]


class AdvancedCodeEvaluationMetric(CodeInsightsCodeEvaluationMetric):
    """Extended code evaluation metric with additional analyses"""

    def __init__(self, use_codebert: bool = True):
        super().__init__(use_codebert=use_codebert)


class UnitTestAlignmentMetric(CodeInsightsFunctionalCorrectnessMetric):
    """Metric for evaluating C++ code generation by comparing unit test results with student correctness pattern."""

    def _calculate_alignment_metrics(self, llm_pattern: List[int], student_pattern: List[int]) -> List[Stat]:
        """
        Calculate alignment metrics between LLM and student correctness patterns.
        """
        # Ensure patterns have same length (pad with 0s if needed)
        max_length = max(len(llm_pattern), len(student_pattern))
        llm_padded = llm_pattern + [0] * (max_length - len(llm_pattern))
        student_padded = student_pattern + [0] * (max_length - len(student_pattern))

        # Calculate alignment metrics
        total_tests = max_length
        exact_matches = sum(1 for i in range(total_tests) if llm_padded[i] == student_padded[i])

        # Alignment ratio (percentage of matching tests)
        alignment_ratio = exact_matches / total_tests if total_tests > 0 else 0.0

        # Calculate LLM and student pass rates
        llm_pass_rate = sum(llm_padded) / total_tests if total_tests > 0 else 0.0
        student_pass_rate = sum(student_padded) / total_tests if total_tests > 0 else 0.0

        print(f"Alignment calculation: {exact_matches}/{total_tests} = {alignment_ratio}")

        return [
            Stat(MetricName("unittest_alignment_ratio")).add(alignment_ratio),
            Stat(MetricName("unittest_llm_pass_rate")).add(llm_pass_rate),
            Stat(MetricName("unittest_student_pass_rate")).add(student_pass_rate),
        ]

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Evaluate LLM-generated code by running unit tests and computing pass rate.

        Returns:
            List of Stat objects containing the functional correctness score
        """
        print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
        print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")

        # Get the generated code from the request state
        if not request_state.result or not request_state.result.completions:
            print("ERROR: No output generated")
            return self._create_failure_stats("No output generated")

        generated_code = request_state.result.completions[0].text.strip()
        generated_code = self._extract_student_code(generated_code)
        print(f"Generated code length: {len(generated_code)}")
        print(f"Generated code preview: {generated_code[:200]}...")

        # Get test cases from instance extra_data
        if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
            print("ERROR: No extra_data available")
            print(f"Instance attributes: {dir(request_state.instance)}")
            return self._create_failure_stats("No test data available")

        extra_data = request_state.instance.extra_data
        print(f"Extra data keys: {list(extra_data.keys())}")

        test_cases = extra_data.get("test_cases", [])
        student_correctness_pattern = extra_data.get("student_correctness_pattern", [])
        question_template = extra_data.get("question_template", "")
        question_name = extra_data.get("question_name", "UNKNOWN")

        print(f"Question name: {question_name}")
        print(f"Number of test cases: {len(test_cases)}")
        print(f"Template length: {len(question_template)}")

        if not test_cases:
            print("ERROR: No test cases available")
            return self._create_failure_stats("No test cases available")

        print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")

        # Run unit tests and calculate pass rate
        evaluator = CPPEvaluator(question_template, test_cases, timeout=self.timeout, max_workers=self.max_workers)
        llm_correctness_pattern = evaluator.evaluate(generated_code)["testcases"]
        print(f"LLM correctness pattern: {llm_correctness_pattern}")

        # Compare patterns and calculate alignment metrics
        alignment_stats = self._calculate_alignment_metrics(llm_correctness_pattern, student_correctness_pattern)

        print(f"Final alignment stats: {[stat.name.name for stat in alignment_stats]}")
        print("=== END UNIT TEST ALIGNMENT DEBUG ===\n")

        return alignment_stats

    def _create_failure_stats(self, error_message: str) -> List[Stat]:
        """Create default statistics for failure cases."""
        print(f"UNIT TEST ALIGNMENT METRIC FAILURE: {error_message}")
        return [
            Stat(MetricName("unittest_alignment_ratio")).add(0.0),
            Stat(MetricName("unittest_llm_pass_rate")).add(0.0),
            Stat(MetricName("unittest_student_pass_rate")).add(0.0),
        ]


class CodeInsightsComprehensiveCodeEvaluationMetric(CodeInsightsCodeEvaluationMetric):
    """Comprehensive metric combining AST, CodeBERT, and unit test alignment."""

    def __init__(self, use_codebert: bool = True):
        super().__init__(use_codebert=use_codebert)
        self.unittest_metric = UnitTestAlignmentMetric()

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Evaluate with AST, CodeBERT, and unit test alignment metrics."""

        # Get base AST and CodeBERT metrics
        stats = super().evaluate_generation(adapter_spec, request_state, metric_service, eval_cache_path)

        # Add unit test alignment metrics
        unittest_stats = self.unittest_metric.evaluate_generation(
            adapter_spec, request_state, metric_service, eval_cache_path
        )
        stats.extend(unittest_stats)
        return stats


# Legacy method for batch evaluation (if needed for backward compatibility)
def evaluate_ast_distances_batch(results: Dict, analyzer: ASTAnalyzer) -> pd.DataFrame:
    """
    Legacy batch evaluation method for AST distances.
    This can be used outside of HELM if needed.
    """
    all_rows = []

    for student_id, info in results.items():
        ground_truth = info["ground_truth"]

        for model_name, generated_code in info["outputs"].items():
            if "Error:" in generated_code:
                normalized_distance = 1.0  # Maximum distance for errors
            else:
                try:
                    normalized_distance = analyzer.calculate_ast_distance(generated_code, ground_truth)
                except Exception:
                    normalized_distance = 1.0

            all_rows.append(
                {
                    "student_id": student_id,
                    "question_id": info["question_id"],
                    "model": model_name,
                    "normalized_distance": normalized_distance,
                }
            )

    df_long = pd.DataFrame(all_rows)
    df_ast = df_long.pivot(
        index=["student_id", "question_id"], columns="model", values="normalized_distance"
    ).reset_index()

    return df_ast
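For readers who want to try these analyzers outside a HELM run, the sketch below exercises `ASTAnalyzer` and `CodeBERTAnalyzer` directly. It is a minimal example under stated assumptions (libclang and python-Levenshtein installed, plus torch and transformers for the CodeBERT part) and is not part of the package.

```python
# Minimal standalone sketch (not shipped in the wheel): exercising the analyzers
# from codeinsights_code_evaluation_metrics.py above. Assumes libclang and
# python-Levenshtein are available; the CodeBERT part additionally needs
# torch and transformers.
from helm.benchmark.metrics.codeinsights_code_evaluation_metrics import (
    CODEBERT_AVAILABLE,
    ASTAnalyzer,
    CodeBERTAnalyzer,
)

reference = "int add(int a, int b) { return a + b; }"
candidate = "int add(int x, int y) { return y + x; }"

# Per the class docstring, 0 means identical ASTs and 1 means completely different
# (or a parse failure). Pass a libclang path if it is not on LD_LIBRARY_PATH.
ast_analyzer = ASTAnalyzer()
print("AST distance:", ast_analyzer.calculate_ast_distance(candidate, reference))

# Cosine similarity of mean-pooled CodeBERT embeddings (optional dependency).
if CODEBERT_AVAILABLE:
    codebert = CodeBERTAnalyzer()
    print("CodeBERT similarity:", codebert.calculate_embedding_similarity(candidate, reference))
```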