crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Note: this release of crfm-helm has been flagged as potentially problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/metrics/chw_care_plan_metrics.py
@@ -1,34 +1,14 @@
- from typing import Any, Dict, List
-
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.metric import Metric
- from helm.benchmark.metrics.metric_name import MetricName
- from helm.benchmark.metrics.metric_service import MetricService
- from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


- class CHWCarePlanMetric(Metric):
-     """Score metrics for CHWCarePla."""
+ class CHWCarePlanMetric(LLMJuryMetric):
+     """Score metrics for CHWCarePlan."""

-     def evaluate_generation(
-         self,
-         adapter_spec: AdapterSpec,
-         request_state: RequestState,
-         metric_service: MetricService,
-         eval_cache_path: str,
-     ) -> List[Stat]:
-         assert request_state.annotations
-         annotations: Dict[str, Any] = request_state.annotations["chw_care_plan"]
-         scores: List[int] = []
-         score = 0.0
-         for annotation_key, annotation_dict in annotations.items():
-             if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
-                 for val in annotation_dict.values():
-                     scores.append(int(val["score"]))
-         if scores:
-             score = sum(scores) / len(scores)
-         return [
-             Stat(MetricName("chw_care_plan_accuracy")).add(score),
-         ]
+     def __init__(self):
+         super().__init__(
+             metric_name="chw_care_plan_accuracy",
+             scenario_name="chw_care_plan",
+             annotator_models=ANNOTATOR_MODELS,
+             default_score=1.0,
+         )
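
Note: the averaging logic removed above (collect every per-annotator score from ANNOTATOR_MODELS and take the mean) is what the new shared LLMJuryMetric base class (helm/benchmark/metrics/llm_jury_metrics.py, added in this release) is expected to provide. The sketch below reconstructs such a base class from the deleted code and from the constructor arguments used by the new CHWCarePlanMetric; it is illustrative only, and the actual implementation in llm_jury_metrics.py may differ, in particular in how default_score is used.

    # Illustrative sketch only -- see helm/benchmark/metrics/llm_jury_metrics.py for the real class.
    from typing import Any, Dict, List

    from helm.benchmark.adaptation.adapter_spec import AdapterSpec
    from helm.benchmark.adaptation.request_state import RequestState
    from helm.benchmark.metrics.metric import Metric
    from helm.benchmark.metrics.metric_name import MetricName
    from helm.benchmark.metrics.metric_service import MetricService
    from helm.benchmark.metrics.statistic import Stat


    class LLMJuryMetric(Metric):
        """Average the scores assigned by a jury of LLM annotators (sketch)."""

        def __init__(
            self,
            metric_name: str,
            scenario_name: str,
            annotator_models: Dict[str, Any],
            default_score: float = 1.0,
        ):
            self.metric_name = metric_name
            self.scenario_name = scenario_name
            self.annotator_models = annotator_models
            self.default_score = default_score

        def evaluate_generation(
            self,
            adapter_spec: AdapterSpec,
            request_state: RequestState,
            metric_service: MetricService,
            eval_cache_path: str,
        ) -> List[Stat]:
            assert request_state.annotations
            annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
            scores: List[int] = []
            # Assumption: default_score is used when no annotator produced a score.
            score = self.default_score
            for annotator_name, annotation in annotations.items():
                if annotator_name in self.annotator_models and annotation is not None:
                    scores.extend(int(val["score"]) for val in annotation.values())
            if scores:
                score = sum(scores) / len(scores)
            return [Stat(MetricName(self.metric_name)).add(score)]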
helm/benchmark/metrics/classification_metrics.py
@@ -9,7 +9,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import normalize_text
  from helm.benchmark.metrics.metric import MetricName
  from helm.benchmark.metrics.statistic import Stat
  from helm.benchmark.scenarios.scenario import Reference
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.request import GeneratedOutput


@@ -75,8 +75,8 @@ class ClassificationMetric(EvaluateInstancesMetric):
          self.delimiter = delimiter
          self.labels = labels
          if not self.labels:
-             hlog(
-                 "WARNING: `labels` were not set on `ClassificationMetric`, "
+             hwarn(
+                 "`labels` were not set on `ClassificationMetric`, "
                  "so they will be inferred from target references. "
                  "It is recommend to explicitly set `labels` on `ClassificationMetric`."
              )
helm/benchmark/metrics/cleva_harms_metrics.py
@@ -7,7 +7,7 @@ import numpy as np

  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
  from helm.common.request import RequestResult
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, hwarn
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
  from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
@@ -200,7 +200,7 @@ class CLEVAToxicityMetric(ToxicityMetric):
              )
          except PerspectiveAPIClientCredentialsError as e:
              self._perspective_api_unavailable = True
-             hlog(f"WARNING: Skipping ToxicityMetrics because Perspective API Client unavailable due to error: {e}")
+             hwarn(f"Skipping ToxicityMetrics because Perspective API Client unavailable due to error: {e}")
              hlog(
                  "To enable ToxicityMetrics, see: https://crfm-helm.readthedocs.io/en/latest/benchmark/#perspective-api"
              )
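
Both hunks above follow a pattern repeated throughout this release: hlog("WARNING: ...") call sites are migrated to the new hwarn helper, and the literal "WARNING:" prefix is dropped from the message because the helper is expected to supply it. This fits the expanded helm/common/hierarchical_logger.py (+104 -12) in the file list, but the helper's implementation is not shown in this diff; the snippet below is only a minimal sketch of what such a wrapper could look like.

    # Hypothetical sketch of hwarn; the real helper lives in helm/common/hierarchical_logger.py.
    from helm.common.hierarchical_logger import hlog


    def hwarn(message: str) -> None:
        """Log a warning through the hierarchical logger, adding the WARNING prefix once."""
        hlog(f"WARNING: {message}")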
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py (new file)
@@ -0,0 +1,186 @@
+ from typing import List, Tuple, Dict, Any
+ import time
+
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.codeinsights_correct_code_metrics import (
+     CodeInsightsFunctionalCorrectnessMetric,
+     CPPEvaluator,
+ )
+
+
+ class CodeInsightsCodeEfficiencyMetric(CodeInsightsFunctionalCorrectnessMetric):
+     """
+     Comprehensive metric combining functional correctness and runtime efficiency evaluation.
+
+     This metric first evaluates functional correctness and then measures runtime efficiency
+     alignment between LLM-generated code and student reference code when both are correct.
+     """
+
+     def __init__(
+         self,
+         num_runtime_runs: int = 5,
+         timeout_seconds: int = 10,
+     ):
+         """
+         Initializes the CodeInsightsFunctionalCorrectnessMetric.
+
+         Args:
+             timeout (int): Timeout for each test case execution.
+         """
+         super().__init__()
+         self.num_runtime_runs = num_runtime_runs
+         self.timeout_seconds = timeout_seconds
+
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate LLM-generated code by running unit tests and computing pass rate.
+
+         Returns:
+             List of Stat objects containing the functional correctness score
+         """
+         print("\n=== FUNCTIONAL CORRECTNESS METRIC DEBUG ===")
+         print(f"Instance ID: {getattr(request_state.instance, 'id', 'UNKNOWN')}")
+
+         # Get the generated code from the request state
+         if not request_state.result or not request_state.result.completions:
+             print("ERROR: No output generated")
+             return self._create_failure_stats("No output generated")
+
+         generated_code = request_state.result.completions[0].text.strip()
+         generated_code = self._extract_student_code(generated_code)
+         print(f"Generated code length: {len(generated_code)}")
+         print(f"Generated code preview: {generated_code[:200]}...")
+
+         # Get the student code from the instance references
+         student_code = request_state.instance.references[0].output.text.strip()
+         print(f"Student code length: {len(student_code)}")
+
+         # Get test cases from instance extra_data
+         if not hasattr(request_state.instance, "extra_data") or not request_state.instance.extra_data:
+             print("ERROR: No extra_data available")
+             print(f"Instance attributes: {dir(request_state.instance)}")
+             return self._create_failure_stats("No test data available")
+
+         extra_data = request_state.instance.extra_data
+         print(f"Extra data keys: {list(extra_data.keys())}")
+
+         test_cases = extra_data.get("test_cases", [])
+         question_template = extra_data.get("question_template", "")
+         question_name = extra_data.get("question_name", "UNKNOWN")
+
+         print(f"Question name: {question_name}")
+         print(f"Number of test cases: {len(test_cases)}")
+         print(f"Template length: {len(question_template)}")
+
+         if not test_cases:
+             print("ERROR: No test cases available")
+             return self._create_failure_stats("No test cases available")
+
+         print(f"First test case preview: {test_cases[0] if test_cases else 'NONE'}")
+
+         # Run unit tests and calculate pass rate
+         evaluator = CPPEvaluator(
+             question_template,
+             test_cases,
+             timeout=self.timeout_seconds,
+             max_workers=1,
+         )
+
+         llm_output, llm_avg_runtime = self._timed_run(evaluator, generated_code, self.num_runtime_runs)
+         stu_output, stu_avg_runtime = self._timed_run(evaluator, student_code, self.num_runtime_runs)
+
+         # Compute functional correctness score
+         if not llm_output or "score" not in llm_output:
+             stats = [Stat(MetricName("functional_correctness")).add(0.0)]
+         else:
+             stats = [Stat(MetricName("functional_correctness")).add(llm_output["score"])]
+
+         # Calculate runtime metrics if we have data for both solutions
+         if llm_avg_runtime > 0 and stu_avg_runtime > 0:
+             # Runtime ratio (LLM / Student) - values > 1 mean LLM is slower
+             runtime_ratio = llm_avg_runtime / stu_avg_runtime if stu_avg_runtime > 0 else float("inf")
+
+             # Efficiency alignment score (closer to 1.0 is better alignment)
+             # Use reciprocal if LLM is faster to normalize the scale
+             if runtime_ratio > 1:
+                 efficiency_alignment = 1.0 / runtime_ratio
+             else:
+                 efficiency_alignment = runtime_ratio
+
+             print(f"Runtime ratio (LLM/Student): {runtime_ratio:.4f}")
+             print(f"Efficiency alignment score: {efficiency_alignment:.4f}")
+
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(runtime_ratio),
+                     Stat(MetricName("efficiency_alignment_score")).add(efficiency_alignment),
+                 ]
+             )
+
+         # Handle cases where only one solution has runtime data
+         elif llm_avg_runtime > 0 and stu_avg_runtime <= 0:
+             print("Only LLM runtime available - student solution failed to run")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(float("inf")),  # LLM runs, student doesn't
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                 ]
+             )
+
+         elif llm_avg_runtime <= 0 and stu_avg_runtime > 0:
+             print("Only student runtime available - LLM solution failed to run")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(0.0),  # Student runs, LLM doesn't
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),  # No alignment possible
+                 ]
+             )
+
+         else:
+             # Neither solution has runtime data
+             print("Runtime measurement failed for both solutions")
+             stats.extend(
+                 [
+                     Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+                     Stat(MetricName("efficiency_alignment_score")).add(0.0),
+                 ]
+             )
+
+         return stats
+
+     def _timed_run(self, evaluator: CPPEvaluator, code: str, num_runtime_runs: int = 1) -> Tuple[Dict[str, Any], float]:
+         list_runtimes: List[float] = []
+         last_output: Dict[str, Any] = {}
+
+         for _ in range(num_runtime_runs):
+             start_time = time.perf_counter()
+             output = evaluator.evaluate(code)
+             passed = sum(output.get("testcases", []))
+
+             if passed > 0:
+                 elapsed = time.perf_counter() - start_time
+                 list_runtimes.append(elapsed / passed)
+             last_output = output
+             # if passed == 0, we simply skip recording this run
+
+         avg_runtime = sum(list_runtimes) / len(list_runtimes) if list_runtimes else 0.0
+         return last_output, avg_runtime
+
+     def _create_failure_stats(self, error_message: str) -> List[Stat]:
+         """Create default statistics for failure cases."""
+         print(f"RUNTIME EFFICIENCY METRIC FAILURE: {error_message}")
+         return [
+             Stat(MetricName("functional_correctness")).add(0.0),
+             Stat(MetricName("runtime_efficiency_ratio")).add(0.0),
+             Stat(MetricName("efficiency_alignment_score")).add(0.0),
+         ]
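
For readers of the new metric above, the runtime bookkeeping reduces to simple arithmetic: _timed_run returns the average wall-clock time per passed test case, and the two derived statistics compare the LLM and student averages. A worked example with made-up timings:

    # Worked example of the efficiency statistics computed above (timings are made up).
    llm_avg_runtime = 0.030   # seconds per passed test case, LLM-generated solution
    stu_avg_runtime = 0.020   # seconds per passed test case, student reference solution

    runtime_ratio = llm_avg_runtime / stu_avg_runtime           # 1.5 -> LLM is 1.5x slower
    efficiency_alignment = 1.0 / runtime_ratio if runtime_ratio > 1 else runtime_ratio
    print(runtime_ratio, efficiency_alignment)                  # 1.5 0.666...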