crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
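
The excerpts below show the most significant source changes. A recurring change across the adapters, annotators, and executor is the replacement of hand-written hlog("WARNING: ...") calls with the new hwarn helper exported by helm.common.hierarchical_logger. A minimal sketch of the before-and-after pattern, assuming hwarn supplies the warning prefix itself (the diffs below drop the literal "WARNING:" strings when switching to it):

    from helm.common.hierarchical_logger import hlog, hwarn

    # 0.5.5 style: every call site spelled out the prefix by hand.
    hlog("WARNING: only 3 training instances, wanted 5")

    # 0.5.6 style: pass the bare message; hwarn is assumed to add the prefix.
    hwarn("only 3 training instances, wanted 5")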
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py
@@ -10,7 +10,7 @@ from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
  from helm.common.general import parallel_map
  from helm.common.request import Request
- from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+ from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn
  from helm.benchmark.adaptation.adapters.adapter import Adapter
@@ -39,8 +39,8 @@ class InContextLearningAdapter(Adapter, ABC):
  # Pick out training instances
  all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
  if len(all_train_instances) < self.adapter_spec.max_train_instances:
- hlog(
- f"WARNING: only {len(all_train_instances)} training instances, "
+ hwarn(
+ f"only {len(all_train_instances)} training instances, "
  f"wanted {self.adapter_spec.max_train_instances}"
  )
helm/benchmark/adaptation/adapters/test_adapter.py
@@ -1,9 +1,9 @@
  import shutil
  import tempfile

- from helm.common.authentication import Authentication
+
  from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
- from helm.proxy.services.server_service import ServerService
+ from helm.common.local_context import LocalContext
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
@@ -14,8 +14,8 @@ class TestAdapter:
  def setup_method(self):
  self.path: str = tempfile.mkdtemp()
- service = ServerService(base_path=self.path, root_mode=True, cache_backend_config=BlackHoleCacheBackendConfig())
- self.tokenizer_service = TokenizerService(service, Authentication("test"))
+ context = LocalContext(base_path=self.path, cache_backend_config=BlackHoleCacheBackendConfig())
+ self.tokenizer_service = TokenizerService(context)
  def teardown_method(self, _):
  shutil.rmtree(self.path)
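
As the test_adapter.py diff above shows, the ServerService plus Authentication pair used for local tokenization has been replaced by the new LocalContext. A minimal sketch of the 0.5.6 setup, using only the constructors that appear in the diff (the temporary directory is illustrative):

    import tempfile

    from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
    from helm.common.local_context import LocalContext
    from helm.benchmark.window_services.tokenizer_service import TokenizerService

    # LocalContext replaces ServerService(..., root_mode=True, ...); no Authentication object is needed.
    base_path = tempfile.mkdtemp()  # illustrative scratch directory
    context = LocalContext(base_path=base_path, cache_backend_config=BlackHoleCacheBackendConfig())
    tokenizer_service = TokenizerService(context)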
helm/benchmark/annotation/air_bench_annotator.py
@@ -47,7 +47,7 @@ class AIRBench2024Annotator(Annotator):
  model_input_text = request_state.request.prompt
  model_output_text = request_state.result.completions[0].text
  if not model_output_text.strip():
- return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+ return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 1.0}
  category_id = request_state.instance.references[0].output.text
  prompt_template = self._category_id_to_judge_prompt[category_id]
  # Strip to deal with incorrectly formatted input CSV.
helm/benchmark/annotation/bigcodebench_annotator.py
@@ -9,7 +9,7 @@ from retrying import retry
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, hwarn
  # Based on https://github.com/bigcode-project/bigcodebench/blob/0331489b29cbf2653b4669597ef431e158882aab/bigcodebench/syncheck.py#L14 # noqa: E501
@@ -60,8 +60,8 @@ class BigCodeBenchAnnotator(Annotator):
  hlog(f"BigCodeBenchAnnotator will use the configured endpoint {endpoint}")
  self.client = Client(endpoint, hf_token=api_key)
  else:
- hlog(
- f"WARNING: BigCodeBenchAnnotator will use the default public evaluator endpoint {self.DEFAULT_URL} - "
+ hwarn(
+ f"BigCodeBenchAnnotator will use the default public evaluator endpoint {self.DEFAULT_URL} - "
  "set bigcodebenchApiKey and bigcodebenchEndpoint in credentials.conf to use a cloned evaluator instead"
  )
  self.client = Client(self.DEFAULT_URL)
helm/benchmark/annotation/bird_sql_annotator.py
@@ -6,7 +6,7 @@ import sqlite3
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
  from helm.benchmark.runner import get_benchmark_output_path
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  class BirdSQLAnnotator(Annotator):
@@ -34,7 +34,7 @@ class BirdSQLAnnotator(Annotator):
  cursor.execute(ground_truth_sql)
  ground_truth_result = cursor.fetchall()
  except (sqlite3.OperationalError, sqlite3.Warning) as e:
- hlog(f"WARNING: Ground truth SQL failed with error: {e}")
+ hwarn(f"Ground truth SQL failed with error: {e}")
  assert request_state.result is not None
  assert len(request_state.result.completions) == 1
helm/benchmark/annotation/chw_care_plan_annotator.py
@@ -5,9 +5,9 @@ from helm.clients.auto_client import AutoClient
  PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
- generated response of a clinical scenario.
- Your goal is to assess how well the generated response captures the necessary information and
- how it compares to the gold response in terms of accuracy, completeness, and clarity.
+ generated clinical note given unstructured clinical text.
+ Your goal is to assess how well the generated response captures the necessary information
+ and follows provided instructions in terms of accuracy, structure, and clarity.
  The user's request will be provided in these tags:
  <user_request>
@@ -19,11 +19,6 @@ The response will be provided in these tags:
  {{RESPONSE}}
  </response>
- The reference response will be provided in these tags:
- <gold_response>
- {{GOLD_RESPONSE}}
- </gold_response>
-
  Carefully analyze the <response>.
  For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
  and provide a short justification for your score.
@@ -32,8 +27,8 @@ Evaluation Criteria:
  Accuracy (1-5)
  - Is all the information in the response factually correct?
- Completeness (1-5)
- - Does the response include all necessary information from the gold response?
+ Structure (1-5)
+ - Does the response contain all parts for the provided note generation structure?
  Clarity (1-5)
  - Is the response easy to understand for a clinician?
@@ -45,7 +40,7 @@ Output the evaluation as a single valid JSON object matching the following struc
  "score": 0,
  "explanation": "Explain why this score was given."
  },
- "completeness": {
+ "structure": {
  "score": 0,
  "explanation": "Explain why this score was given."
  },
@@ -64,7 +59,7 @@ Ensure the output is valid JSON:
  ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
  "accuracy": {"score", "explanation"},
- "completeness": {"score", "explanation"},
+ "structure": {"score", "explanation"},
  "clarity": {"score", "explanation"},
  }
helm/benchmark/annotation/ehr_sql_annotator.py
@@ -4,7 +4,7 @@ import re
  import sqlite3
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.benchmark.runner import get_benchmark_output_path
@@ -32,7 +32,7 @@ class EhrSqlAnnotator(Annotator):
  cursor.execute(ground_truth_sql)
  ground_truth_result = cursor.fetchall()
  except (sqlite3.OperationalError, sqlite3.Warning) as e:
- hlog(f"WARNING: Ground truth SQL failed with error: {e}")
+ hwarn(f"Ground truth SQL failed with error: {e}")
  # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
  if not ground_truth_result and request_state.instance.extra_data is not None:
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py
@@ -5,7 +5,7 @@ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
  from helm.clients.auto_client import AutoClient
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.request import Request
  from helm.proxy.retry import NonRetriableException
@@ -107,8 +107,8 @@ Please respond with your output and reasoning in the following format, your reas
  if reasoning_match:
  reasoning = reasoning_match.group(1).strip()
  else:
- hlog(
- "WARNING: HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
+ hwarn(
+ "HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
@@ -116,13 +116,13 @@ Please respond with your output and reasoning in the following format, your reas
  try:
  score = float(score_match.group(1).strip())
  except ValueError:
- hlog(
- "WARNING: HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
+ hwarn(
+ "HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
  else:
- hlog(
- "WARNING: HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
+ hwarn(
+ "HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
helm/benchmark/annotation/mimic_bhc_annotator.py (new file)
@@ -0,0 +1,100 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are tasked with evaluating the quality of the generated brief hospital
+ course based on the provided clinical note.
+ Your goal is to assess how well the brief hospital course captures all the clinical details and
+ compares to the gold response in terms of accuracy, completeness, and clarity.
+
+
+ The user's request will be provided in these tags:
+ <user_request>
+ {{QUESTION}}
+ </user_request>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ A potential correct response will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully analyze the <response>. For each of the following categories,
+ rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+ and provide a short justification for your score.
+
+ Your evaluation should focus on the following criteria:
+ Accuracy (1-5)
+ - Does the brief hospital course correctly reflect the key details from the clinical note?
+
+ Completeness (1-5)
+ - Does the brief hospital course include all important details and address the clinical scenario?
+
+ Clarity (1-5)
+ -Is the brief hospital course easy for clinicians to understand?
+
+
+ Output Format:
+ Output the evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+ "accuracy": {"score", "explanation"},
+ "completeness": {"score", "explanation"},
+ "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+ "gpt": AnnotatorModelInfo(
+ model_name="openai/gpt-4o-2024-05-13",
+ model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+ ),
+ "llama": AnnotatorModelInfo(
+ model_name="meta/llama-3.3-70b-instruct",
+ model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+ ),
+ "claude": AnnotatorModelInfo(
+ model_name="anthropic/claude-3-7-sonnet-20250219",
+ model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+ ),
+ }
+
+
+ class MIMICBHCAnnotator(LLMAsJuryAnnotator):
+ """The MIMICBHC autograder."""
+
+ name = "mimic_bhc"
+
+ def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+ super().__init__(
+ auto_client=auto_client,
+ prompt_template=PROMPT_TEMPLATE,
+ annotation_criteria=ANNOTATION_CRITERIA,
+ annotator_models=ANNOTATOR_MODELS,
+ )
helm/benchmark/annotation/model_as_judge.py
@@ -6,7 +6,7 @@ from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
  from helm.clients.auto_client import AutoClient
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, hwarn
  from helm.common.request import Request
@@ -184,16 +184,13 @@ class LLMAsJuryAnnotator(Annotator):
  """
  for key, value in self._annotation_criteria.items():
  if key not in annotator_criteria:
- hlog(
- f"WARNING: Annotator did not find the expected key "
- f"'{key}' in the response from {annotator_name}."
- )
+ hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.")
  return False
  for subkey in value:
  if subkey not in annotator_criteria[key]:
- hlog(
- f"WARNING: Annotator did not find the expected subkey "
+ hwarn(
+ f"Annotator did not find the expected subkey "
  f"'{subkey}' in the response from {annotator_name}."
  )
  return False
@@ -212,7 +209,7 @@ class LLMAsJuryAnnotator(Annotator):
  # Check for empty model output
  model_output_text = request_state.result.completions[0].text
  if not model_output_text.strip():
- hlog("WARNING: Annotator skipped sending requests because the model response was empty")
+ hwarn("Annotator skipped sending requests because the model response was empty")
  return {
  "prompt_text": None,
  "empty_output_equivalence_judgement": False,
@@ -264,7 +261,7 @@ class LLMAsJuryAnnotator(Annotator):
  annotator_response = self._auto_client.make_request(annotator_request)
  if not annotator_response.success:
- hlog(f"WARNING: Got an error response from {model_info.model_name}: " f"{annotator_response.error}")
+ hwarn(f"Got an error response from {model_info.model_name}: " f"{annotator_response.error}")
  return None
  try:
@@ -280,17 +277,16 @@ class LLMAsJuryAnnotator(Annotator):
  try:
  annotator_criteria = json.loads(annotator_output)
  except Exception as ex:
- hlog(
- f"WARNING: Error parsing response from {model_info.model_name} "
+ hwarn(
+ f"Error parsing response from {model_info.model_name} "
  f"after adding closing brace: {ex}. "
  f"Model output: {annotator_output}"
  )
  return None
  else:
  # For other JSON decoding errors
- hlog(
- f"WARNING: JSON decoding error from {model_info.model_name}: {e}. "
- f"Model output: {annotator_output}"
+ hwarn(
+ f"JSON decoding error from {model_info.model_name}: {e}. " f"Model output: {annotator_output}"
  )
  return None
@@ -301,8 +297,8 @@ class LLMAsJuryAnnotator(Annotator):
  return annotator_criteria
  except Exception as e:
- hlog(
- f"WARNING: Unexpected error processing response from {model_info.model_name}: {e}. "
+ hwarn(
+ f"Unexpected error processing response from {model_info.model_name}: {e}. "
  f"Model output: {annotator_output}"
  )
  return None
helm/benchmark/annotation/omni_math_annotator.py
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
  from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
  from helm.clients.auto_client import AutoClient
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.request import Request
@@ -47,9 +47,8 @@ class OmniMATHAnnotator(Annotator):
  .replace("{{Solution}}", model_output_text)
  )
  if not model_output_text.strip():
- hlog(
- "WARNING: OmniMATHAnnotator skipped sending requests to annotator models "
- "because the model response was empty"
+ hwarn(
+ "OmniMATHAnnotator skipped sending requests to annotator models " "because the model response was empty"
  )
  return {
  "prompt_text": None,
@@ -85,8 +84,8 @@ class OmniMATHAnnotator(Annotator):
  )
  annotator_response = self._auto_client.make_request(annotator_request)
  if not annotator_response.success:
- hlog(
- "WARNING: OmniMATHAnnotator got an error response from "
+ hwarn(
+ "OmniMATHAnnotator got an error response from "
  f"{annotator_model_info.model_name}: {annotator_response.error}"
  )
  else:
@@ -96,16 +95,16 @@ class OmniMATHAnnotator(Annotator):
  try:
  student_final_answer = report_parts["Student Final Answer"]
  except KeyError:
- hlog(
- "WARNING: OmniMATHAnnotator could not get Student Final Answer from annotation from "
+ hwarn(
+ "OmniMATHAnnotator could not get Student Final Answer from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
  try:
  justification = report_parts["Justification"].strip().removesuffix("=== report over ===").strip()
  except KeyError:
- hlog(
- "WARNING: OmniMATHAnnotator could not get Justification from annotation from "
+ hwarn(
+ "OmniMATHAnnotator could not get Justification from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
@@ -116,13 +115,13 @@ class OmniMATHAnnotator(Annotator):
  elif equivalence_judgement_str == "FALSE":
  equivalence_judgement = False
  else:
- hlog(
- "WARNING: OmniMATHAnnotator got a non-boolean Equivalence Judgement from annotation from "
+ hwarn(
+ "OmniMATHAnnotator got a non-boolean Equivalence Judgement from annotation from "
  f"{annotator_model_info.model_name}: {equivalence_judgement_str}"
  )
  except KeyError:
- hlog(
- "WARNING: OmniMATHAnnotator could not get Equivalence Judgement from annotation from "
+ hwarn(
+ "OmniMATHAnnotator could not get Equivalence Judgement from annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
helm/benchmark/annotation/wildbench_annotator.py
@@ -7,7 +7,7 @@ from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.annotator import Annotator
  from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
  from helm.clients.auto_client import AutoClient
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hwarn
  from helm.common.request import Request
@@ -32,8 +32,8 @@ class WildBenchAnnotator(Annotator):
  model_output_text = request_state.result.completions[0].text
  if not model_output_text.strip():
  # Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
- hlog(
- "WARNING: WildBenchAnnotator skipped sending requests to annotator models "
+ hwarn(
+ "WildBenchAnnotator skipped sending requests to annotator models "
  "because the model response was empty"
  )
  return {
@@ -87,8 +87,8 @@ class WildBenchAnnotator(Annotator):
  score: Optional[float] = None
  annotator_response = self._auto_client.make_request(annotator_request)
  if not annotator_response.success:
- hlog(
- "WARNING: WildBenchAnnotator got an error response from "
+ hwarn(
+ "WildBenchAnnotator got an error response from "
  f"{annotator_model_info.model_name}: : {annotator_response.error}"
  )
  else:
@@ -96,8 +96,8 @@ class WildBenchAnnotator(Annotator):
  annotator_response_text = annotator_response.completions[0].text
  annotator_response_parts = self._pattern.search(annotator_response_text)
  if not annotator_response_parts:
- hlog(
- "WARNING: WildBenchAnnotator got a malformed annotation from "
+ hwarn(
+ "WildBenchAnnotator got a malformed annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
  else:
@@ -107,8 +107,8 @@ class WildBenchAnnotator(Annotator):
  try:
  score = float(score_text)
  except ValueError:
- hlog(
- "WARNING: WildBenchAnnotator could not parse the score from the annotation from "
+ hwarn(
+ "WildBenchAnnotator could not parse the score from the annotation from "
  f"{annotator_model_info.model_name}: {annotator_response_text}"
  )
helm/benchmark/executor.py
@@ -1,19 +1,19 @@
  from typing import Optional
  from dataclasses import dataclass, replace
+
+ from helm.common.context import Context
+ from helm.common.local_context import LocalContext
+ from helm.common.remote_context import RemoteContext
  from helm.common.cache_backend_config import (
  CacheBackendConfig,
  BlackHoleCacheBackendConfig,
  MongoCacheBackendConfig,
  SqliteCacheBackendConfig,
  )
-
  from helm.common.general import parallel_map
- from helm.common.hierarchical_logger import htrack, hlog
+ from helm.common.hierarchical_logger import htrack, hlog, hwarn
  from helm.common.request import RequestResult, GeneratedOutput
  from helm.common.authentication import Authentication
- from helm.proxy.services.remote_service import RemoteService
- from helm.proxy.services.server_service import ServerService
- from helm.proxy.services.service import Service
  from helm.benchmark.adaptation.scenario_state import ScenarioState
  from helm.benchmark.adaptation.request_state import RequestState
@@ -29,7 +29,7 @@ class ExecutionSpec:
  """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""
  auth: Authentication
- """Authentication that will be passed into the local service, if using the local service."""
+ """Authentication that will be passed into the remote service, if using the remote context."""
  local_path: Optional[str]
  """Path where API credentials and cache is stored.
@@ -75,15 +75,14 @@ class Executor:
  else:
  cache_backend_config = BlackHoleCacheBackendConfig()
- self.service: Service
+ self.context: Context
  if execution_spec.url:
  hlog(f"Running using remote API proxy server: {execution_spec.url}")
- self.service = RemoteService(execution_spec.url)
+ self.context = RemoteContext(execution_spec.url, execution_spec.auth)
  elif execution_spec.local_path:
  hlog(f"Running in local mode with base path: {execution_spec.local_path}")
- self.service = ServerService(
+ self.context = LocalContext(
  base_path=execution_spec.local_path,
- root_mode=True,
  cache_backend_config=cache_backend_config,
  )
  else:
@@ -111,12 +110,12 @@ class Executor:
  def process(self, state: RequestState) -> RequestState:
  try:
- result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request)
+ result: RequestResult = self.context.make_request(state.request)
  except Exception as e:
  raise ExecutorError(f"{str(e)} Request: {state.request}") from e
  if not result.success:
  if result.error_flags and not result.error_flags.is_fatal:
- hlog(f"WARNING: Non-fatal error treated as empty completion: {result.error}")
+ hwarn(f"Non-fatal error treated as empty completion: {result.error}")
  result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
  else:
  raise ExecutorError(f"{str(result.error)} Request: {state.request}")
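
The executor now routes requests through the new Context abstraction instead of Service. A sketch of the selection logic shown in the diff above, pulled out into a standalone function for clarity (build_context is an illustrative helper name, not part of the package):

    from helm.common.context import Context
    from helm.common.local_context import LocalContext
    from helm.common.remote_context import RemoteContext


    def build_context(execution_spec, cache_backend_config) -> Context:
        """Sketch of Executor's new wiring: a remote URL yields a RemoteContext
        (which now carries the Authentication); otherwise a LocalContext is built
        from the local base path and cache backend config."""
        if execution_spec.url:
            return RemoteContext(execution_spec.url, execution_spec.auth)
        return LocalContext(
            base_path=execution_spec.local_path,
            cache_backend_config=cache_backend_config,
        )


    # Requests then go through the context directly, without passing auth per call:
    # result = context.make_request(state.request)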
helm/benchmark/metrics/aci_bench_metrics.py
@@ -1,34 +1,14 @@
- from typing import Any, Dict, List
-
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
- from helm.benchmark.metrics.metric import Metric
- from helm.benchmark.metrics.metric_name import MetricName
- from helm.benchmark.metrics.metric_service import MetricService
- from helm.benchmark.metrics.statistic import Stat
+ from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
- class ACIBenchMetric(Metric):
+ class ACIBenchMetric(LLMJuryMetric):
  """Score metrics for ACIBench."""
- def evaluate_generation(
- self,
- adapter_spec: AdapterSpec,
- request_state: RequestState,
- metric_service: MetricService,
- eval_cache_path: str,
- ) -> List[Stat]:
- assert request_state.annotations
- annotations: Dict[str, Any] = request_state.annotations["aci_bench"]
- scores: List[int] = []
- score = 0.0
- for annotation_key, annotation_dict in annotations.items():
- if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
- for val in annotation_dict.values():
- scores.append(int(val["score"]))
- if scores:
- score = sum(scores) / len(scores)
- return [
- Stat(MetricName("aci_bench_accuracy")).add(score),
- ]
+ def __init__(self):
+ super().__init__(
+ metric_name="aci_bench_accuracy",
+ scenario_name="aci_bench",
+ annotator_models=ANNOTATOR_MODELS,
+ default_score=1.0,
+ )
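
This rewrite of aci_bench_metrics.py is representative: many MedHELM metric modules in this release (med_dialog, medalign, medi_qa, mimic_rrs, and others listed above with +9/-29 line counts) replace their hand-written evaluate_generation methods with thin subclasses of the new LLMJuryMetric base class. A sketch of the pattern using the constructor arguments visible in the diff (MyScenarioMetric and its string arguments are illustrative; scenario_name presumably has to match the annotator key under which the jury scores are stored):

    from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
    from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


    class MyScenarioMetric(LLMJuryMetric):
        """Hypothetical jury-scored metric following the 0.5.6 pattern."""

        def __init__(self):
            super().__init__(
                metric_name="my_scenario_accuracy",  # illustrative metric name
                scenario_name="my_scenario",         # illustrative; assumed to match the annotator's stored key
                annotator_models=ANNOTATOR_MODELS,   # jury models defined alongside the scenario's annotator
                default_score=1.0,
            )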
helm/benchmark/metrics/bias_word_lists.py
@@ -1,4 +1,4 @@
- """ Bias words utilised to compute the bias metrics.
+ """Bias words utilised to compute the bias metrics.
  This file includes word lists for the following 4 categories:
  - Race (Asian, Hispanic, and White): Most common names for each race (Garg et al. 2018)