crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,186 @@
+ from typing import List, Tuple, Dict, Any
+ import time
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
+     CodeInsightsFunctionalCorrectnessMetric,
+     CPPEvaluator,
+ )
+
+
+ class CodeInsightsCodeEfficiencyMetric(CodeInsightsFunctionalCorrectnessMetric):
+     """
+     Comprehensive metric combining functional correctness and runtime efficiency evaluation.
+
+     This metric first evaluates functional correctness and then measures runtime efficiency
+     alignment between LLM-generated code and student reference code when both are correct.
+     """
+
+     def __init__(
+         self,
+         num_runtime_runs: int = 5,
+         timeout_seconds: int = 10,
+     ):
+         """
+         Initializes the CodeInsightsCodeEfficiencyMetric.
+
+         Args:
+             timeout_seconds (int): Timeout for each test case execution.
+         """
+         super().__init__()
+         self.num_runtime_runs = num_runtime_runs
+         self.timeout_seconds = timeout_seconds
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+         Returns:
+             List of Stat objects containing the functional correctness score
+         """
+         print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+         print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+         # Get the generated code from the request state
+         if not request_state.result or not request_state.result.completions:
+             print("ERROR: No output generated")
+             return self._create_failure_stats("No output generated")
+
+         generated_code = request_state.result.completions[0].text.strip()
+         generated_code = self._extract_student_code(generated_code)
+         print(f"Generated code length: {len(generated_code)}")
+         print(f"Generated code preview: {generated_code[:200]}...")
+
+         # Get the student code from the instance references
+         student_code = request_state.instance.references[0].output.text.strip()
+         print(f"Student code length: {len(student_code)}")
+
+         # Get test cases from instance extra_data
+         if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+             print("ERROR: No extra_data available")
+             print(f"Instance attributes: {dir(request_state.instance)}")
+             return self._create_failure_stats("No test data available")
+
+         extra_data = request_state.instance.extra_data
+         print(f"Extra data keys: {list(extra_data.keys())}")
+
+         test_cases = extra_data.get("test_cases", [])
+         question_template = extra_data.get("question_template", "")
+         question_name = extra_data.get("question_name", "UNKNOWN")
+
+         print(f"Question name: {question_name}")
+         print(f"Number of test cases: {len(test_cases)}")
+         print(f"Template length: {len(question_template)}")
+
+         if not test_cases:
+             print("ERROR: No test cases available")
+             return self._create_failure_stats("No test cases available")
+
+         print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+         # Run unit tests and calculate pass rate
+         evaluator = CPPEvaluator(
+             question_template,
+             test_cases,
+             timeout=self.timeout_seconds,
+             max_workers=1,
+         )
+
+         llm_output, llm_avg_runtime = self._timed_run(evaluator, generated_code, self.num_runtime_runs)
+         stu_output, stu_avg_runtime = self._timed_run(evaluator, student_code, self.num_runtime_runs)
+
+         # Compute functional correctness score
+         if not llm_output or "score" not in llm_output:
+             stats = [Stat(MetricName("functional_correctness")).add(0.0)]
+         else:
+             stats = [Stat(MetricName("functional_correctness")).add(llm_output["score"])]
+
+         # Calculate runtime metrics if we have data for both solutions
+         if llm_avg_runtime > 0 and stu_avg_runtime > 0:
+             # Runtime ratio (LLM / Student) - values > 1 mean LLM is slower
+             runtime_ratio = llm_avg_runtime / stu_avg_runtime if stu_avg_runtime > 0 else float("inf")
+
+             # Efficiency alignment score (closer to 1.0 is better alignment)
+             # Take the reciprocal when the LLM is slower so the score stays in (0, 1]
+             if runtime_ratio > 1:
+                 efficiency_alignment = 1.0 / runtime_ratio
+             else:
+                 efficiency_alignment = runtime_ratio
+
+             print(f"Runtime ratio (LLM/Student): {runtime_ratio:.4f}")
+             print(f"Efficiency alignment score: {efficiency_alignment:.4f}")
+
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(runtime_ratio),
+                     Stat(MetricName("efficiency_alignment_score")).add(efficiency_alignment),
+                 ]
+             )
+
+         # Handle cases where only one solution has runtime data
+         elif llm_avg_runtime > 0 and stu_avg_runtime <= 0:
+             print("Only LLM runtime available - student solution failed to run")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(float("inf")),  # LLM runs, student doesn't
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                 ]
+             )
+
+         elif llm_avg_runtime <= 0 and stu_avg_runtime > 0:
+             print("Only student runtime available - LLM solution failed to run")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(0.0),  # Student runs, LLM doesn't
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                 ]
+             )
+
+         else:
+             # Neither solution has runtime data
+             print("Runtime measurement failed for both solutions")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),
+                 ]
+             )
+
+         return stats
+
+     def _timed_run(self, evaluator: CPPEvaluator, code: str, num_runtime_runs: int = 1) -> Tuple[Dict[str, Any], float]:
+         list_runtimes: List[float] = []
+         last_output: Dict[str, Any] = {}
+
+         for _ in range(num_runtime_runs):
+             start_time = time.perf_counter()
+             output = evaluator.evaluate(code)
+             passed = sum(output.get("testcases", []))
+
+             if passed > 0:
+                 elapsed = time.perf_counter() - start_time
+                 list_runtimes.append(elapsed / passed)
+                 last_output = output
+             # if passed == 0, we simply skip recording this run
+
+         avg_runtime = sum(list_runtimes) / len(list_runtimes) if list_runtimes else 0.0
+         return last_output, avg_runtime
+
+     def _create_failure_stats(self, error_message: str) -> List[Stat]:
+         """Create default statistics for failure cases."""
+         print(f"RUNTIME EFFICIENCY METRIC FAILURE: {error_message}")
+         return [
+             Stat(MetricName("functional_correctness")).add(0.0),
+             Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+             Stat(MetricName("efficiency_alignment_score")).add(0.0),
+         ]
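
The metric above is wired into runs through the new codeinsights_metric_specs.py and codeinsights_run_specs.py files listed in the change summary, whose contents are not shown in this hunk. As a rough illustration only (the helper name below is hypothetical and may not match the released code), the class could be referenced through HELM's MetricSpec mechanism with arguments mirroring the __init__ signature above:

from typing import List

from helm.benchmark.metrics.metric import MetricSpec


def get_codeinsights_code_efficiency_metric_specs(num_runtime_runs: int = 5, timeout_seconds: int = 10) -> List[MetricSpec]:
    # Hypothetical helper: the released spec builder lives in codeinsights_metric_specs.py and may differ.
    return [
        MetricSpec(
            class_name="helm.benchmark.metrics.codeinsights_code_efficiency_metrics.CodeInsightsCodeEfficiencyMetric",
            args={"num_runtime_runs": num_runtime_runs, "timeout_seconds": timeout_seconds},
        )
    ]
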
@@ -0,0 +1,477 @@
+ from typing import Dict, List
+ import pandas as pd
+ import re
+ import os
+ import subprocess
+ import tempfile
+ import clang.cindex
+ from clang.cindex import CursorKind
+ from Levenshtein import ratio as levenshtein_distance_ratio
+
+ try:
+     import torch
+     import torch.nn.functional as F
+     from transformers import RobertaTokenizer, RobertaModel
+
+     CODEBERT_AVAILABLE = True
+ except ImportError:
+     CODEBERT_AVAILABLE = False
+     print("Warning: CodeBERT dependencies not available. Install with: pip install torch transformers")
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
+     CodeInsightsFunctionalCorrectnessMetric,
+     CPPEvaluator,
+ )
+
+
+ def _cpp_to_asm(src: str, compiler: str = "g++") -> str:
+     """Return the assembly text for `src`, or '' if the compile fails."""
+
+     with tempfile.NamedTemporaryFile(mode="w", suffix=".cpp", delete=False) as f:
+         f.write(src)
+         cpp_path = f.name
+     asm_path = cpp_path.replace(".cpp", ".s")
+     try:
+         subprocess.run(
+             [compiler, "-std=c++17", "-S", "-o", asm_path, cpp_path],
+             check=True,
+             capture_output=True,
+             text=True,
+             timeout=30,
+         )
+         temp_file_name = os.path.basename(asm_path)
+         with open(asm_path, "r") as fh:
+             asm_code = fh.read()
+         asm_code = asm_code.replace(temp_file_name, "asm_output.cpp")  # Normalize file name in output
+         return asm_code
+
+     except subprocess.CalledProcessError as e:
+         print("⚠️ Assembly compilation failed:", e.stderr[:200])
+         return ""
+     finally:
+         try:
+             os.unlink(cpp_path)
+             if os.path.exists(asm_path):
+                 os.unlink(asm_path)
+         except Exception:
+             pass
+
+
+ class ASTAnalyzer:
+     """Class for calculating AST edit distances between two C++ code snippets using libclang."""
+
+     def __init__(self, clang_lib_path: str = ""):
+         """
+         If libclang isn't on your LD_LIBRARY_PATH, pass its full path here.
+         e.g. '/usr/lib/llvm-14/lib/libclang.so'
+         """
+         if clang_lib_path:
+             clang.cindex.Config.set_library_file(clang_lib_path)
+         self.index = clang.cindex.Index.create()
+
+     def calculate_ast_distance(self, code1: str, code2: str) -> float:
+         """
+         Calculate normalized AST edit distance between two C++ code snippets.
+         Returns a float in [0,1], where 0 means identical ASTs and 1 means completely different
+         (or a parse failure).
+         """
+         try:
+             tu1 = self.index.parse(path="code1.cpp", args=["-std=c++17"], unsaved_files=[("code1.cpp", code1)])
+             tu2 = self.index.parse(path="code2.cpp", args=["-std=c++17"], unsaved_files=[("code2.cpp", code2)])
+
+             nodes1: List[str] = self._extract_ast_features(tu1.cursor)
+             nodes2: List[str] = self._extract_ast_features(tu2.cursor)
+
+             return levenshtein_distance_ratio(nodes1, nodes2)
+
+         except Exception:
+             # any parse error or clang error → max distance
+             return 1.0
+
+     def _extract_ast_features(self, node: clang.cindex.Cursor) -> List[str]:
+         """Recursively walk Clang AST, appending feature strings to feats."""
+         feats: List[str] = []
+         # record the node kind
+         feats.append(node.kind.name)
+
+         # some node-specific details
+         if node.kind == CursorKind.FUNCTION_DECL:
+             feats.append(f"Function:{node.spelling}")
+         elif node.kind == CursorKind.DECL_REF_EXPR:
+             feats.append(f"Ref:{node.spelling}")
+         elif node.kind in (
+             CursorKind.INTEGER_LITERAL,
+             CursorKind.FLOATING_LITERAL,
+             CursorKind.STRING_LITERAL,
+             CursorKind.CHARACTER_LITERAL,
+         ):
+             # get literal token text
+             tokens = list(node.get_tokens())
+             if tokens:
+                 feats.append(f"Literal:{tokens[0].spelling}")
+         else:
+             # for other nodes, just use the spelling if available
+             if node.spelling:
+                 feats.append(f"Other:{node.spelling}")
+
+         # recurse
+         for child in node.get_children():
+             feats = feats + self._extract_ast_features(child)
+
+         return feats
+
+
+ class CodeBERTAnalyzer:
+     """Utility class for calculating semantic code similarity using CodeBERT."""
+
+     def __init__(self):
+         if not CODEBERT_AVAILABLE:
+             raise ImportError("CodeBERT dependencies not available. Install with: pip install torch transformers")
+
+         # Initialize CodeBERT model and tokenizer
+         self.model_name = "microsoft/codebert-base"
+         self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
+         self.model = RobertaModel.from_pretrained(self.model_name)
+         self.model.eval()  # Set to evaluation mode
+
+     def strip_code_fence(self, code: str) -> str:
+         """Remove code fence markers from code strings."""
+         # Remove ```python, ```cpp, ``` etc.
+         code = re.sub(r"^```\w*\n", "", code, flags=re.MULTILINE)
+         code = re.sub(r"\n```$", "", code, flags=re.MULTILINE)
+         return code.strip()
+
+     def get_code_embedding(self, code: str, max_length: int = 512) -> torch.Tensor:
+         """Compute fixed-size embedding vector for code using CodeBERT."""
+         inputs = self.tokenizer(code, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")
+
+         attention_mask = inputs.attention_mask
+
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             last_hidden = outputs.last_hidden_state
+
+         # Apply attention mask and average valid token embeddings
+         mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
+         summed = torch.sum(last_hidden * mask, dim=1)
+         counts = torch.clamp(mask.sum(dim=1), min=1e-9)
+         embedding = summed / counts
+
+         return embedding.squeeze(0)
+
+     def calculate_embedding_similarity(self, code1: str, code2: str) -> float:
+         """Calculate cosine similarity between code embeddings."""
+         clean_code1 = self.strip_code_fence(code1)
+         clean_code2 = self.strip_code_fence(code2)
+
+         emb1 = self.get_code_embedding(clean_code1)
+         emb2 = self.get_code_embedding(clean_code2)
+
+         cosine_sim = F.cosine_similarity(emb1, emb2, dim=0).item()
+         return cosine_sim
+
+
+ class CodeInsightsCodeEvaluationMetric(Metric):
+     """Metric for evaluating code generation quality using AST analysis and CodeBERT similarity."""
+
+     def __init__(self, use_codebert: bool = True):
+         self.ast_analyzer = ASTAnalyzer()
+         self.use_codebert = use_codebert and CODEBERT_AVAILABLE
+
+         if self.use_codebert:
+             try:
+                 self.codebert_analyzer = CodeBERTAnalyzer()
+             except Exception as e:
+                 print(f"Warning: Failed to initialize CodeBERT analyzer: {e}")
+                 self.use_codebert = False
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """Evaluate a single generated code snippet."""
+         stats = []
+
+         # Get the generated code from the request state
+         if not request_state.result or not request_state.result.completions:
+             return self._create_default_stats(0.5)
+
+         generated_code = request_state.result.completions[0].text.strip()
+         generated_code = self._extract_student_code(generated_code)
+
+         # Get the ground truth from the instance references
+         if not request_state.instance.references:
+             return self._create_default_stats(0.5)
+
+         ground_truth = request_state.instance.references[0].output.text.strip()
+
+         # Calculate AST distance
+         if not generated_code or "Error:" in generated_code:
+             ast_distance = 1.0
+         else:
+             try:
+                 ast_distance = self.ast_analyzer.calculate_ast_distance(generated_code, ground_truth)
+             except Exception:
+                 ast_distance = 1.0
+
+         # Create AST-based statistics
+         stats.extend(self._create_ast_stats(ast_distance))
+
+         # Calculate assembly distance
+         gen_asm = _cpp_to_asm(generated_code)
+         truth_asm = _cpp_to_asm(ground_truth)
+         if gen_asm == "" or truth_asm == "":
+             asm_distance = 1.0
+         else:
+             asm_distance = levenshtein_distance_ratio(gen_asm, truth_asm)
+
+         # Create assembly-based statistics
+         stats.extend(self._create_asm_stats(asm_distance))
+
+         # Calculate CodeBERT similarity if available
+         if self.use_codebert:
+             try:
+                 codebert_similarity = self.codebert_analyzer.calculate_embedding_similarity(
+                     generated_code, ground_truth
+                 )
+                 stats.extend(self._create_codebert_stats(codebert_similarity))
+             except Exception as e:
+                 print(f"Warning: CodeBERT similarity calculation failed: {e}")
+                 # Add default CodeBERT stats for failed calculations
+                 stats.extend(self._create_codebert_stats(0.0))
+
+         return stats
+
+     def _extract_student_code(self, model_code: str) -> str:
+         """
+         Extracts clean C++ code from model output:
+         - Trims preambles
+         - Removes student's main()
+         """
+
+         code_blocks = re.findall(r"```(?:c\+\+)?\n(.*?)```", model_code, flags=re.DOTALL)
+         if code_blocks:
+             model_code = code_blocks[0].strip()  # Use the first code block
+             print("[Markdown extraction] Used fenced code blocks.")
+
+         # Post-processing
+         lines = model_code.strip().splitlines()
+         start_keywords = ("#include", "using namespace")
+         for i, line in enumerate(lines):
+             if any(line.strip().startswith(k) for k in start_keywords):
+                 lines[i] = ""
+
+         code = "\n".join(lines).strip()
+         if "int main" in code:
+             code = code.split("int main")[0].strip()
+
+         # --- Final touch ---
+         if "print(" in code and "void print()" not in code and "print()" not in code:
+             print("⚠️ WARNING: `print()` is called in test input but not defined.")
+
+         print(f"[Final extracted code length] {len(code)}")
+         print(f"[Code preview]\n{code[:300]}...\n")
+         return code
+
+     def _create_default_stats(self, distance: float) -> List[Stat]:
+         """Create default statistics for error cases."""
+         stats = self._create_ast_stats(distance)
+         if self.use_codebert:
+             stats.extend(self._create_codebert_stats(0.0))
+         return stats
+
+     def _create_ast_stats(self, ast_distance: float) -> List[Stat]:
+         """Create AST-based statistics."""
+         return [Stat(MetricName("ast_distance")).add(ast_distance)]
+
+     def _create_codebert_stats(self, codebert_similarity: float) -> List[Stat]:
+         """Create CodeBERT-based statistics."""
+         return [Stat(MetricName("codebert_similarity")).add(codebert_similarity)]
+
+     def _create_asm_stats(self, asm_distance: float) -> List[Stat]:
+         """Create assembly-based statistics."""
+         return [Stat(MetricName("asm_distance")).add(asm_distance)]
+
+
+ class AdvancedCodeEvaluationMetric(CodeInsightsCodeEvaluationMetric):
+     """Extended code evaluation metric with additional analyses"""
+
+     def __init__(self, use_codebert: bool = True):
+         super().__init__(use_codebert=use_codebert)
+
+
+ class UnitTestAlignmentMetric(CodeInsightsFunctionalCorrectnessMetric):
+     """Metric for evaluating C++ code generation by comparing unit test results with student correctness pattern."""
+
+     def _calculate_alignment_metrics(self, llm_pattern: List[int], student_pattern: List[int]) -> List[Stat]:
+         """
+         Calculate alignment metrics between LLM and student correctness patterns.
+         """
+         # Ensure patterns have same length (pad with 0s if needed)
+         max_length = max(len(llm_pattern), len(student_pattern))
+         llm_padded = llm_pattern + [0] * (max_length - len(llm_pattern))
+         student_padded = student_pattern + [0] * (max_length - len(student_pattern))
+
+         # Calculate alignment metrics
+         total_tests = max_length
+         exact_matches = sum(1 for i in range(total_tests) if llm_padded[i] == student_padded[i])
+
+         # Alignment ratio (percentage of matching tests)
+         alignment_ratio = exact_matches / total_tests if total_tests > 0 else 0.0
+
+         # Calculate LLM and student pass rates
+         llm_pass_rate = sum(llm_padded) / total_tests if total_tests > 0 else 0.0
+         student_pass_rate = sum(student_padded) / total_tests if total_tests > 0 else 0.0
+
+         print(f"Alignment calculation: {exact_matches}/{total_tests} = {alignment_ratio}")
+
+         return [
+             Stat(MetricName("unittest_alignment_ratio")).add(alignment_ratio),
+             Stat(MetricName("unittest_llm_pass_rate")).add(llm_pass_rate),
+             Stat(MetricName("unittest_student_pass_rate")).add(student_pass_rate),
+         ]
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+         Returns:
+             List of Stat objects containing the functional correctness score
+         """
+         print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+         print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+         # Get the generated code from the request state
+         if not request_state.result or not request_state.result.completions:
+             print("ERROR: No output generated")
+             return self._create_failure_stats("No output generated")
+
+         generated_code = request_state.result.completions[0].text.strip()
+         generated_code = self._extract_student_code(generated_code)
+         print(f"Generated code length: {len(generated_code)}")
+         print(f"Generated code preview: {generated_code[:200]}...")
+
+         # Get test cases from instance extra_data
+         if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+             print("ERROR: No extra_data available")
+             print(f"Instance attributes: {dir(request_state.instance)}")
+             return self._create_failure_stats("No test data available")
+
+         extra_data = request_state.instance.extra_data
+         print(f"Extra data keys: {list(extra_data.keys())}")
+
+         test_cases = extra_data.get("test_cases", [])
+         student_correctness_pattern = extra_data.get("student_correctness_pattern", [])
+         question_template = extra_data.get("question_template", "")
+         question_name = extra_data.get("question_name", "UNKNOWN")
+
+         print(f"Question name: {question_name}")
+         print(f"Number of test cases: {len(test_cases)}")
+         print(f"Template length: {len(question_template)}")
+
+         if not test_cases:
+             print("ERROR: No test cases available")
+             return self._create_failure_stats("No test cases available")
+
+         print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+         # Run unit tests and calculate pass rate
+         evaluator = CPPEvaluator(question_template, test_cases, timeout=self.timeout, max_workers=self.max_workers)
+         llm_correctness_pattern = evaluator.evaluate(generated_code)["testcases"]
+         print(f"LLM correctness pattern: {llm_correctness_pattern}")
+
+         # Compare patterns and calculate alignment metrics
+         alignment_stats = self._calculate_alignment_metrics(llm_correctness_pattern, student_correctness_pattern)
+
+         print(f"Final alignment stats: {[stat.name.name for stat in alignment_stats]}")
+         print("=== END UNIT TEST ALIGNMENT DEBUG ===\n")
+
+         return alignment_stats
+
+     def _create_failure_stats(self, error_message: str) -> List[Stat]:
+         """Create default statistics for failure cases."""
+         print(f"UNIT TEST ALIGNMENT METRIC FAILURE: {error_message}")
+         return [
+             Stat(MetricName("unittest_alignment_ratio")).add(0.0),
+             Stat(MetricName("unittest_llm_pass_rate")).add(0.0),
+             Stat(MetricName("unittest_student_pass_rate")).add(0.0),
+         ]
+
+
+ class CodeInsightsComprehensiveCodeEvaluationMetric(CodeInsightsCodeEvaluationMetric):
+     """Comprehensive metric combining AST, CodeBERT, and unit test alignment."""
+
+     def __init__(self, use_codebert: bool = True):
+         super().__init__(use_codebert=use_codebert)
+         self.unittest_metric = UnitTestAlignmentMetric()
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """Evaluate with AST, CodeBERT, and unit test alignment metrics."""
+
+         # Get base AST and CodeBERT metrics
+         stats = super().evaluate_generation(adapter_spec, request_state, metric_service, eval_cache_path)
+
+         # Add unit test alignment metrics
+         unittest_stats = self.unittest_metric.evaluate_generation(
+             adapter_spec, request_state, metric_service, eval_cache_path
+         )
+         stats.extend(unittest_stats)
+         return stats
+
+
+ # Legacy method for batch evaluation (if needed for backward compatibility)
+ def evaluate_ast_distances_batch(results: Dict, analyzer: ASTAnalyzer) -> pd.DataFrame:
+     """
+     Legacy batch evaluation method for AST distances.
+     This can be used outside of HELM if needed.
+     """
+     all_rows = []
+
+     for student_id, info in results.items():
+         ground_truth = info["ground_truth"]
+
+         for model_name, generated_code in info["outputs"].items():
+             if "Error:" in generated_code:
+                 normalized_distance = 1.0  # Maximum distance for errors
+             else:
+                 try:
+                     normalized_distance = analyzer.calculate_ast_distance(generated_code, ground_truth)
+                 except Exception:
+                     normalized_distance = 1.0
+
+             all_rows.append(
+                 {
+                     "student_id": student_id,
+                     "question_id": info["question_id"],
+                     "model": model_name,
+                     "normalized_distance": normalized_distance,
+                 }
+             )
+
+     df_long = pd.DataFrame(all_rows)
+     df_ast = df_long.pivot(
+         index=["student_id", "question_id"], columns="model", values="normalized_distance"
+     ).reset_index()
+
+     return df_ast
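
The legacy evaluate_ast_distances_batch helper above can also be called outside a HELM run. A minimal usage sketch, assuming libclang and the Levenshtein package are installed, with made-up student and question identifiers:

from helm.benchmark.metrics.codeinsights_code_evaluation_metrics import (
    ASTAnalyzer,
    evaluate_ast_distances_batch,
)

# Point ASTAnalyzer at libclang explicitly if it is not on the default loader path
# (the path below is the example given in the class docstring).
analyzer = ASTAnalyzer(clang_lib_path="/usr/lib/llvm-14/lib/libclang.so")

# Hypothetical inputs shaped the way evaluate_ast_distances_batch expects:
# results[student_id] -> {"question_id", "ground_truth", "outputs": {model_name: generated_code}}.
results = {
    "student_001": {
        "question_id": "q42",
        "ground_truth": "int add(int a, int b) { return a + b; }",
        "outputs": {
            "model-a": "int add(int x, int y) { return x + y; }",
            "model-b": "Error: generation failed",
        },
    },
}

df = evaluate_ast_distances_batch(results, analyzer)
print(df)  # one row per (student_id, question_id), one column per model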