crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (98)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py CHANGED
@@ -264,6 +264,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +282,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-            register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
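The new flag is ordinary argparse plumbing: it defaults to False and is only forwarded to the registration helpers when set. A minimal standalone sketch of the same pattern, with a hypothetical register_model helper standing in for register_huggingface_hub_model_from_flag_value and illustrative (not HELM's actual) argument definitions:

import argparse

def register_model(name: str, openvino: bool = False) -> None:
    # Hypothetical stand-in for register_huggingface_hub_model_from_flag_value.
    backend = "openvino" if openvino else "default"
    print(f"registering {name} (backend={backend})")

parser = argparse.ArgumentParser()
parser.add_argument("--enable-huggingface-models", nargs="*", default=[])
parser.add_argument("--openvino", action="store_true", default=False)
args = parser.parse_args(["--enable-huggingface-models", "gpt2", "--openvino"])

for model_name in args.enable_huggingface_models:
    # Mirror the diff: only pass the extra argument when the flag is set.
    if args.openvino:
        register_model(model_name, args.openvino)
    else:
        register_model(model_name)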
helm/benchmark/run_expander.py CHANGED
@@ -194,6 +194,15 @@ class StopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
@@ -1035,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1390,72 @@ class ChatMLRunExpander(RunExpander):
         ]
 
 
+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    def __init__(self, scenario: str):
+        self.scenario = scenario
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+            if run_spec.adapter_spec.output_prefix:
+                instructions = (
+                    f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                )
+
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            else:
+                instructions = f"{instructions}\n"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        raise ValueError(f"Unknown scenario {self.scenario}")
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
@@ -1402,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]
 
 
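Because OutputFormatInstructions is now registered in RUN_EXPANDER_SUBCLASSES, it is wired into the normal run-expander machinery; it can also be exercised directly. A hedged sketch (the RunSpec below is a made-up example; the expander only reads the adapter method, output prefix, and instructions, so any ScenarioSpec works):

from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
from helm.benchmark.run_expander import OutputFormatInstructions
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec

run_spec = RunSpec(
    name="narrative_qa_example",
    # Placeholder scenario; it is never resolved in this sketch.
    scenario_spec=ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario"),
    adapter_spec=AdapterSpec(method=ADAPT_GENERATION, output_prefix="Answer: "),
    metric_specs=[],
    groups=[],
)

expanded = OutputFormatInstructions("narrative_qa").expand(run_spec)
# The single expanded RunSpec now carries the extra formatting instruction
# ("Answer with one word, a few-word phrase, or a short sentence. ...").
print(expanded[0].adapter_spec.instructions)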
helm/benchmark/run_spec_factory.py CHANGED
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
             increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
             run_spec = singleton(increase_temperature_expander.expand(run_spec))
 
+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
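The MedLM-Large special case relies on the new "none" value of StopRunExpander shown earlier: it clears the adapter's stop sequences and records the choice in the run name. A hedged illustration of the underlying dataclasses.replace step, shown on a bare AdapterSpec for brevity:

from dataclasses import replace
from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Start from an adapter that has stop sequences configured, then clear them,
# mirroring what StopRunExpander("none") does to run_spec.adapter_spec above.
adapter_spec = AdapterSpec(model="google/medlm-large", stop_sequences=["###"])
adapter_spec = replace(adapter_spec, stop_sequences=[])
print(adapter_spec.stop_sequences)  # []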
helm/benchmark/run_specs/air_bench_run_specs.py ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )
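A hedged usage sketch: the run_spec_function decorator registers the function above under the name "air_bench_2024", but it can also be called directly to inspect the resulting RunSpec.

from helm.benchmark.run_specs.air_bench_run_specs import get_air_bench_2024_spec

run_spec = get_air_bench_2024_spec()
print(run_spec.name)                     # air_bench_2024
print(run_spec.adapter_spec.max_tokens)  # 512
print([metric.class_name for metric in run_spec.metric_specs])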
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:
 
 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]
 
     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "live_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )
 
 
 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
     )
 
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
        name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "medication_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )
 
@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
helm/benchmark/run_specs/decodingtrust_run_specs.py CHANGED
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
         groups=["decodingtrust", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py ADDED
@@ -0,0 +1,33 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
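For context, ADAPT_MULTIPLE_CHOICE_JOINT renders each question and its lettered options into a single prompt and expects a one-letter answer. The string below is a rough, hand-written illustration of that prompt shape under the input_noun and output_noun used above; the exact rendering HELM produces may differ.

# Illustrative only: approximate multiple-choice-joint prompt shape (assumed, not
# generated by HELM) for get_multiple_choice_adapter_spec with input_noun="Question"
# and output_noun="Answer".
example_prompt = (
    "Give a letter answer among the options given. "
    "For example, if the options are A, B, C, D, E, and F, "
    "your answer should consist of the single letter that corresponds to the correct answer.\n"
    "\n"
    "Question: <question text>\n"
    "A. <option 1>\n"
    "B. <option 2>\n"
    "C. <option 3>\n"
    "D. <option 4>\n"
    "Answer:"
)
print(example_prompt)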
helm/benchmark/run_specs/finance_run_specs.py ADDED
@@ -0,0 +1,33 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
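A hedged usage sketch for the new FinQA spec: with output_noun="Program", the generation adapter prompts the model to emit a program (scored by the custom FinQAMetric) rather than a bare number. Calling the function directly shows the configuration; the exact output prefix formatting is an expectation, not verified here.

from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

run_spec = get_fin_qa_spec()
print(run_spec.adapter_spec.output_prefix)  # expected to contain "Program"
print(run_spec.adapter_spec.max_tokens)     # 100
print([metric.class_name for metric in run_spec.metric_specs])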