crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/presentation/schema.py CHANGED
@@ -1,6 +1,9 @@
+ import ast
+ import dataclasses
  from dataclasses import dataclass, field
  from typing import List, Optional, Dict
  import dacite
+ from inspect import cleandoc
  import mako.template
  import yaml
  import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
  SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"


+ _ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+ _ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+ _ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
  @dataclass(frozen=True)
  class Field:
      """
@@ -198,9 +206,6 @@ class RunGroup(Field):
  class Schema:
      """Specifies information about what to display on the frontend."""

-     # Adapter fields (e.g., temperature)
-     adapter: List[Field]
-
      # Information about each field
      metrics: List[Field]

@@ -213,6 +218,11 @@ class Schema:
      # Group the scenarios
      run_groups: List[RunGroup]

+     # Adapter fields (e.g., temperature)
+     # Automatically populated from the docstrings in the AdapterSpec class definition.
+     # Should not be specified in the user's YAML file.
+     adapter: Optional[List[Field]] = None
+
      def __post_init__(self):
          self.name_to_metric = {metric.name: metric for metric in self.metrics}
          self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
          self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}


+ def get_adapter_fields() -> List[Field]:
+     """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+     # Unfortunately there is no standard library support for getting docstrings of class fields,
+     # so we have to do the parsing outselves. Fortunately, the parsing is quite straightforward.
+     adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+     with open(adapter_spec_path, "r") as f:
+         contents = f.read()
+     module_node = ast.parse(contents)
+     adapter_spec_node = [
+         node
+         for node in ast.iter_child_nodes(module_node)
+         if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+     ][0]
+     metadata_fields: List[Field] = []
+     field_name: str = ""
+     for node in ast.iter_child_nodes(adapter_spec_node):
+         if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+             # This node is a field definition.
+             # Save the name of the field for later.
+             field_name = node.target.id
+         else:
+             # If this is a docstring that immediately follows a field definition,
+             # output an adapter field with the name set to the field definition and
+             # the description set to the docstring.
+             if (
+                 field_name
+                 and isinstance(node, ast.Expr)
+                 and isinstance(node.value, ast.Constant)
+                 and isinstance(node.value.value, str)
+             ):
+                 description = cleandoc(node.value.value).replace("\n", " ")
+                 metadata_fields.append(Field(name=field_name, description=description))
+             field_name = ""
+
+     return metadata_fields
+
+
  def get_default_schema_path() -> str:
      return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)

@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
      hlog(f"Reading schema file {schema_path}...")
      with open(schema_path, "r") as f:
          raw = yaml.safe_load(f)
-     return dacite.from_dict(Schema, raw)
+     schema = dacite.from_dict(Schema, raw)
+     if schema.adapter:
+         hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+     return dataclasses.replace(schema, adapter=get_adapter_fields())
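
A note on the technique used in get_adapter_fields() above: Python's standard library exposes class docstrings but not per-field docstrings, so the code walks the AST of adapter_spec.py and pairs each annotated assignment with the string literal that follows it. The snippet below is a self-contained illustration of that pattern; ExampleSpec and its fields are invented for the example and are not part of HELM.

# Self-contained illustration of the AST pattern used above: an annotated assignment
# (ast.AnnAssign) followed by a bare string constant is treated as "field + docstring".
# ExampleSpec is a toy class invented for this sketch.
import ast
from inspect import cleandoc

source = '''
class ExampleSpec:
    temperature: float = 1.0
    """Sampling temperature passed to the model."""

    max_tokens: int = 100
    """Maximum number of tokens to generate."""
'''

class_node = ast.parse(source).body[0]
fields = {}
field_name = ""
for node in ast.iter_child_nodes(class_node):
    if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
        field_name = node.target.id  # remember the field we just saw
    elif field_name and isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
        fields[field_name] = cleandoc(node.value.value)  # the "docstring" for that field
        field_name = ""

print(fields)
# {'temperature': 'Sampling temperature passed to the model.', 'max_tokens': 'Maximum number of tokens to generate.'}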
helm/benchmark/presentation/test_schema.py ADDED
@@ -0,0 +1,11 @@
+ from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+ def test_get_adapter_fields() -> None:
+     adapter_fields = get_adapter_fields()
+     assert adapter_fields
+     assert adapter_fields[0].name == "method"
+     assert (
+         adapter_fields[0].description
+         == "The high-level strategy for converting instances into a prompt for the language model."
+     )
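
With this change, schema YAML files no longer need to declare adapter fields: read_schema() back-fills Schema.adapter from the AdapterSpec docstrings and only warns if a schema file still contains an adapter section. A minimal usage sketch, assuming a HELM install where the modules above resolve as in this diff:

# Minimal usage sketch of the new behavior; relies only on functions shown in the diff above.
from helm.benchmark.presentation.schema import get_default_schema_path, read_schema

schema = read_schema(get_default_schema_path())
# schema.adapter is now populated automatically from AdapterSpec's field docstrings.
print(schema.adapter[0].name)  # "method", matching the new test above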
helm/benchmark/run.py CHANGED
@@ -264,6 +264,13 @@ def main():
          default=None,
          help="Full class name of the Runner class to use. If unset, uses the default Runner.",
      )
+     parser.add_argument(
+         "--openvino",
+         action="store_true",
+         default=False,
+         help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+         "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+     )
      add_run_args(parser)
      args = parser.parse_args()
      validate_args(args)
@@ -275,12 +282,19 @@
          from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

          for huggingface_model_name in args.enable_huggingface_models:
-             register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+             if args.openvino:
+                 register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+             else:
+                 register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
      if args.enable_local_huggingface_models:
          from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

          for huggingface_model_path in args.enable_local_huggingface_models:
-             register_huggingface_local_model_from_flag_value(huggingface_model_path)
+             if args.openvino:
+                 register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+             else:
+                 register_huggingface_local_model_from_flag_value(huggingface_model_path)

      run_entries: List[RunEntry] = []
      if args.conf_paths:
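
For context on the new --openvino flag: it marks Hugging Face AutoModelForCausalLM models for OpenVINO acceleration, which the updated huggingface_client.py would typically obtain through optimum-intel. The snippet below is a rough standalone sketch of that pattern, not HELM's actual client code; the model name is illustrative.

# Rough sketch of OpenVINO-backed causal LM inference via optimum-intel.
# This approximates what the --openvino flag enables; it is not HELM's implementation.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "gpt2"  # illustrative model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForCausalLM.from_pretrained(model_id, export=True)  # convert to OpenVINO IR on load

inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))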
helm/benchmark/run_expander.py CHANGED
@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
      get_all_code_models,
      get_all_models,
      get_all_text_models,
+     get_model_metadata,
      get_model_names_with_tag,
      FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
      LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
      ABLATION_MODEL_TAG,
      TEXT_TO_IMAGE_MODEL_TAG,
      VISION_LANGUAGE_MODEL_TAG,
+     INSTRUCTION_FOLLOWING_MODEL_TAG,
  )
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
  from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -192,6 +194,15 @@ class StopRunExpander(RunExpander):
          self.value = value

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         if self.value == "none":
+             return [
+                 replace(
+                     run_spec,
+                     name=f"{run_spec.name},{self.name}={self.value}",
+                     adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                 ),
+             ]
+
          if self.value == "hash":
              stop = "###"
          elif self.value == "semicolon":
@@ -322,6 +333,16 @@ class AnthropicClaude3RunExpander(RunExpander):
      name = "claude_3"

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         # Remove all stop sequences that do not contain non-whitespace characters.
+         # This prevents the Anthropic API from returnin the following error:
+         # "stop_sequences: each stop sequence must contain non-whitespace"
+         stop_sequences_with_non_whitespace = [
+             stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+         ]
+         run_spec = replace(
+             run_spec,
+             adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+         )
          if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
              instructions = "Answer with only a single letter."
              if run_spec.adapter_spec.instructions:
@@ -335,78 +356,37 @@ class AnthropicClaude3RunExpander(RunExpander):
          return [run_spec]


- class OpenAIRunExpander(RunExpander):
-     """
-     Custom prompt for OpenAI models.
-     These models need more explicit instructions about following the format.
-     """
+ class FollowFormatInstructionsRunExpander(RunExpander):
+     """Adds more explicit instructions about following the format to prompts.

-     # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
-
-     name = "openai"
-
-     def __init__(self):
-         pass
-
-     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-             return [run_spec]
-
-         return [
-             replace(
-                 run_spec,
-                 name=run_spec.name,
-                 adapter_spec=replace(
-                     run_spec.adapter_spec,
-                     global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                     global_suffix="\n\n"
-                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                     + "\n"
-                     + run_spec.adapter_spec.output_prefix.strip(),
-                 ),
-             ),
-         ]
+     The argument controlls which models will receive these instructions.
+     If "all", all models receive these instructions.
+     If "instruct", only instruction-following models receive these instructions.

+     Only supports the generation adaptation method. Raises an error if used on
+     a RunSpec that uses a different adaptation method.

- class GoogleRunExpander(RunExpander):
-     """
-     Custom prompt for Google models.
-     These models need more explicit instructions about following the format.
+     Note: For legacy backwards compatibility reasons, despite the use of the word
+     "instructions" in this run expander's name, this run expander actually
+     modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+     the instructions.
      """

-     # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+     name = "follow_format_instructions"

-     name = "google"
+     def __init__(self, value: str):
+         if value != "all" and value != "instruct":
+             raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+         self.value = value

      def expand(self, run_spec: RunSpec) -> List[RunSpec]:
          if run_spec.adapter_spec.method != ADAPT_GENERATION:
-             return [run_spec]
-
-         return [
-             replace(
-                 run_spec,
-                 name=run_spec.name,
-                 adapter_spec=replace(
-                     run_spec.adapter_spec,
-                     global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                     global_suffix="\n\n"
-                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                     + "\n"
-                     + run_spec.adapter_spec.output_prefix.strip(),
-                 ),
-             ),
-         ]
-
+             raise Exception("follow_format_instructions run expander only supports the generation adaptation method")

- class MistralRunExpander(RunExpander):
-     """Custom prompt for Mistral models."""
-
-     # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-     name = "output_format_instructions"
-
-     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-         if run_spec.adapter_spec.method != ADAPT_GENERATION:
+         if (
+             self.value == "instruct"
+             and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+         ):
              return [run_spec]

          return [
@@ -539,7 +519,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
          "one": [1],
          "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
          "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-         "heim_human_eval": [0, 1, 2, 4, 8],
+         "vhelm": [0, 1, 2, 4, 8],
      }


@@ -1064,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
      "chinese": {"chinese": [translate(language_code="zh-CN")]},
      "hindi": {"hindi": [translate(language_code="hi")]},
      "spanish": {"spanish": [translate(language_code="es")]},
+     "swahili": {"swahili": [translate(language_code="sw")]},
      # Styles
      "art": {
          "art": [
@@ -1409,12 +1390,79 @@ class ChatMLRunExpander(RunExpander):
          ]


+ class OutputFormatInstructions(RunExpander):
+     """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+     Many instruction-following models and chat models are tuned to expect conversational prompts
+     and respond in a conversational way. These models occasionally produce outputs that are not
+     in the expected format. This run expander instructs these models to provide the output in
+     the format expected by the scenario.
+
+     The argument should be the name of the scenario."""
+
+     name = "output_format_instructions"
+
+     def __init__(self, scenario: str):
+         self.scenario = scenario
+
+     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+             if self.scenario == "mmlu_only_last_question":
+                 instructions = "Answer only the last question with only a single letter."
+             else:
+                 instructions = "Answer with only a single letter."
+             if run_spec.adapter_spec.instructions:
+                 instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+             return [
+                 replace(
+                     run_spec,
+                     adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                 ),
+             ]
+         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+             output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+             if self.scenario == "narrative_qa":
+                 instructions = (
+                     "Answer with one word, a few-word phrase, or a short sentence. "
+                     + "Avoid extra, unnecessary information in the answer."
+                 )
+             elif self.scenario == "natural_qa":
+                 instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+             elif self.scenario == "legalbench":
+                 if output_noun != "Answer":
+                     instructions = f"Answer with the {output_noun.lower()}."
+                 else:
+                     instructions = "Answer yes or no."
+             elif self.scenario == "wmt_14":
+                 instructions = "Answer with the English translation."
+             else:
+                 raise ValueError(f"Unknown scenario {self.scenario}")
+
+             if run_spec.adapter_spec.output_prefix:
+                 instructions = (
+                     f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                 )
+
+             if run_spec.adapter_spec.instructions:
+                 instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+             else:
+                 instructions = f"{instructions}\n"
+             return [
+                 replace(
+                     run_spec,
+                     adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                 ),
+             ]
+         raise ValueError(f"Unknown scenario {self.scenario}")
+
+
  RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
      InstructionsRunExpander,
      PromptRunExpander,
      NewlineRunExpander,
      StopRunExpander,
      FormatPromptRunExpander,
+     FollowFormatInstructionsRunExpander,
      AddToStopRunExpander,
      GlobalPrefixRunExpander,
      NumTrainTrialsRunExpander,
@@ -1430,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
      NumOutputTokensRunExpander,
      ChatMLRunExpander,
      EvalSplitRunExpander,
+     OutputFormatInstructions,
  ]


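To see what the new OutputFormatInstructions run expander does to a run, here is a hedged usage sketch. The RunSpec is constructed by hand with illustrative values (the scenario class path and model name are made up for the example); only classes and fields that appear in this diff are assumed.

# Hedged usage sketch of OutputFormatInstructions; values below are illustrative.
from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
from helm.benchmark.run_expander import OutputFormatInstructions
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec

run_spec = RunSpec(
    name="narrative_qa,model=openai/gpt-4-0613",
    scenario_spec=ScenarioSpec(class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario"),
    adapter_spec=AdapterSpec(method=ADAPT_GENERATION, output_prefix="Answer: ", max_tokens=100),
    metric_specs=[],
    groups=["narrative_qa"],
)

# expand() returns a list of RunSpecs whose adapter_spec.instructions now lead with
# the scenario-specific formatting guidance defined in the class above.
(expanded,) = OutputFormatInstructions("narrative_qa").expand(run_spec)
print(expanded.adapter_spec.instructions)
# "Answer with one word, a few-word phrase, or a short sentence. Avoid extra, unnecessary
#  information in the answer. Do not include 'Answer:' in your answer.\n"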
helm/benchmark/run_spec_factory.py CHANGED
@@ -4,7 +4,6 @@ from typing import List
  from helm.benchmark.adaptation.adapter_spec import (
      ADAPT_GENERATION,
      ADAPT_MULTIPLE_CHOICE_JOINT,
-     ADAPT_GENERATION_MULTIMODAL,
  )
  from helm.benchmark.model_deployment_registry import (
      ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
  from helm.benchmark.model_metadata_registry import (
      ANTHROPIC_CLAUDE_1_MODEL_TAG,
      ANTHROPIC_CLAUDE_2_MODEL_TAG,
+     ANTHROPIC_CLAUDE_3_MODEL_TAG,
      BUGGY_TEMP_0_TAG,
      CHATML_MODEL_TAG,
-     GOOGLE_GEMINI_MODEL_TAG,
+     GOOGLE_GEMINI_PRO_VISION_V1_TAG,
      IDEFICS_INSTRUCT_MODEL_TAG,
-     IDEFICS_MODEL_TAG,
      LLAVA_MODEL_TAG,
      OPEN_FLAMINGO_MODEL_TAG,
-     VISION_LANGUAGE_MODEL_TAG,
      NLG_PREFIX_TAG,
      NO_NEWLINES_TAG,
+     VISION_LANGUAGE_MODEL_TAG,
+     IDEFICS_MODEL_TAG,
      ModelMetadata,
      get_model_metadata,
  )
  from helm.benchmark.run_expander import (
      RUN_EXPANDERS,
      AnthropicClaude2RunExpander,
+     AnthropicClaude3RunExpander,
      ChatMLRunExpander,
      GlobalPrefixRunExpander,
      IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
          if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
              run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))

-         # Google Gemini Vision returns an empty completion or throws an error if max_tokens is 1
+         # Anthropic Claude 3
+         if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+             run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+         # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
          if (
              VISION_LANGUAGE_MODEL_TAG in model.tags
-             and GOOGLE_GEMINI_MODEL_TAG in model.tags
+             and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
              and run_spec.adapter_spec.max_tokens == 1
          ):
              run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

          # IDEFICS special handling
          if IDEFICS_MODEL_TAG in model.tags:
-             # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-             if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-                 run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
              if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
                  run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))

@@ -155,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
              increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
              run_spec = singleton(increase_temperature_expander.expand(run_spec))

+         # MedLM-Large
+         if run_spec.adapter_spec.model == "google/medlm-large":
+             run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
          return run_spec

      run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py ADDED
@@ -0,0 +1,40 @@
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("air_bench_2024")
+ def get_air_bench_2024_spec() -> RunSpec:
+     adapter_spec = AdapterSpec(
+         method=ADAPT_GENERATION,
+         global_prefix="",
+         global_suffix="",
+         instructions="",
+         input_prefix="",
+         input_suffix="",
+         output_prefix="",
+         output_suffix="",
+         instance_prefix="",
+         max_train_instances=0,
+         num_outputs=1,
+         max_tokens=512,
+         temperature=0.0,
+         stop_sequences=[],
+     )
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+     metric_specs = [
+         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+         MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+     ]
+     return RunSpec(
+         name="air_bench_2024",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         annotators=annotator_specs,
+         groups=["air_bench_2024"],
+     )
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
      get_ranking_binary_adapter_spec,
      get_summarization_adapter_spec,
  )
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
  from helm.benchmark.metrics.common_metric_specs import (
      get_basic_metric_specs,
      get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:

  @run_spec_function("live_qa")
  def get_live_qa_spec() -> RunSpec:
-     from helm.common.gpu_utils import get_torch_device_name
-
      scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")

      adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
          max_train_instances=0,
          max_tokens=512,
      )
+     annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+     metric_specs = get_open_ended_generation_metric_specs() + [
+         MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+     ]

      return RunSpec(
          name="live_qa",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_summarization_metric_specs(
-             {"task": "live_qa", "device": get_torch_device_name()},
-         ),
+         annotators=annotator_specs,
+         metric_specs=metric_specs,
          groups=["live_qa"],
      )


  @run_spec_function("medication_qa")
  def get_medication_qa_spec() -> RunSpec:
-     from helm.common.gpu_utils import get_torch_device_name
-
      scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

      adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
          max_tokens=512,
      )

+     annotator_specs = [
+         AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+     ]
+     metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
      return RunSpec(
          name="medication_qa",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_summarization_metric_specs(
-             {"task": "medication_qa", "device": get_torch_device_name()},
-         ),
+         annotators=annotator_specs,
+         metric_specs=metric_specs,
          groups=["medication_qa"],
      )

@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
          metric_specs=get_exact_match_metric_specs(),
-         groups=["thai_exam"],
+         groups=["thai_exam", f"thai_exam_{exam}"],
      )
helm/benchmark/run_specs/decodingtrust_run_specs.py CHANGED
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
          name="decodingtrust_toxicity_prompts",
          scenario_spec=scenario_spec,
          adapter_spec=adapter_spec,
-         metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
+         metric_specs=get_generative_harms_metric_specs(
+             include_basic_metrics=True, include_generative_harms_metrics=True
+         ),
          groups=["decodingtrust", "toxicity_prompts"],
      )
helm/benchmark/run_specs/experimental_run_specs.py ADDED
@@ -0,0 +1,33 @@
+ """Run specs for experiments only.
+
+ These run specs are not intended for use with public leaderboards."""
+
+ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+ from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+ from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("ci_mcqa")
+ def get_ci_mcqa_spec() -> RunSpec:
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+     adapter_spec = get_multiple_choice_adapter_spec(
+         method=ADAPT_MULTIPLE_CHOICE_JOINT,
+         instructions=(
+             "Give a letter answer among the options given. "
+             "For example, if the options are A, B, C, D, E, and F, "
+             "your answer should consist of the single letter that corresponds to the correct answer."
+         ),
+         input_noun="Question",
+         output_noun="Answer",
+     )
+
+     return RunSpec(
+         name="ci_mcqa",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=get_exact_match_metric_specs(),
+         groups=["CIMCQA"],
+     )
helm/benchmark/run_specs/finance_run_specs.py ADDED
@@ -0,0 +1,33 @@
+ """Run spec functions for the HELM Finance leaderboard.
+
+ Website: https://crfm.stanford.edu/helm/finance/"""
+
+ from helm.benchmark.adaptation.common_adapter_specs import (
+     get_generation_adapter_spec,
+ )
+ from helm.benchmark.metrics.common_metric_specs import (
+     get_basic_metric_specs,
+ )
+ from helm.benchmark.metrics.metric import MetricSpec
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+ @run_spec_function("fin_qa")
+ def get_fin_qa_spec() -> RunSpec:
+     from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+     adapter_spec = get_generation_adapter_spec(
+         instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+     )
+     metric_specs = get_basic_metric_specs([]) + [
+         MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+     ]
+     return RunSpec(
+         name="fin_qa",
+         scenario_spec=scenario_spec,
+         adapter_spec=adapter_spec,
+         metric_specs=metric_specs,
+         groups=["fin_qa"],
+     )