crfm-helm 0.5.1 → 0.5.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (98)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/run_specs/vlm_run_specs.py

@@ -7,8 +7,8 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_GENERATION_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
 )
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import DIFFICULTY_ALL
 from helm.benchmark.metrics.common_metric_specs import (
-    get_basic_reference_metric_specs,
     get_exact_match_metric_specs,
     get_generative_harms_metric_specs,
     get_basic_metric_specs,
@@ -30,6 +30,7 @@ def _get_generation_adapter_spec(
     output_prefix: str = "",
     output_suffix: str = "",
     max_tokens: int = 100,
+    max_train_instances: int = 0,
     stop_sequences: Optional[List[str]] = None,
 ) -> AdapterSpec:
     return AdapterSpec(
@@ -41,8 +42,7 @@ def _get_generation_adapter_spec(
         output_prefix=output_prefix,
         output_suffix=output_suffix,
         instance_prefix="\n",
-        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
-        max_train_instances=0,
+        max_train_instances=max_train_instances,
         num_outputs=1,
         max_tokens=max_tokens,
         stop_sequences=stop_sequences if stop_sequences is not None else [],
@@ -70,6 +70,13 @@ def _get_captioning_adapter_spec() -> AdapterSpec:
     )


+def get_open_end_answer_generation_adapter_spec():
+    return _get_generation_adapter_spec(
+        instructions="Follow the given instruction and give your complete answer.",
+        max_tokens=100,
+    )
+
+
 def _get_multiple_choice_joint_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
@@ -117,9 +124,8 @@ def _get_image2structure_metric_specs(
     metric_names = [
         AnnotatedImageMetrics.PIXEL_SIMILARITY,
         AnnotatedImageMetrics.FID_SIMILARITY,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM2,
-        AnnotatedImageMetrics.BLOCK_EARTH_MOVER_SIMILARITY_NORM1,
+        AnnotatedImageMetrics.BLOCK_EMD,
+        AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
     ]
     if include_edit_similarity:
         metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
@@ -136,7 +142,42 @@ def _get_image2structure_metric_specs(
             },
         ),
     ]
-    return metric_specs + get_basic_reference_metric_specs()
+    return metric_specs + get_basic_metric_specs([])
+
+
+def _get_prometheus_vision_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.prometheus_vision_critique_metrics.PrometheusVisionCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
+def _get_gpt4v_critique_originality_metric_specs(num_respondents: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4v_originality_critique_metrics.GPT4VCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+            },
+        )
+    ]
+
+
+def _get_vibe_eval_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.reka_vibe_critique_metrics.RekaVibeCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]


 ############################################################
@@ -190,13 +231,23 @@ def get_chart2csv_spec() -> RunSpec:


 @run_spec_function("crossmodal_3600")
-def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:
+def get_crossmodal_3600_spec(location: str, language: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.crossmodal_3600_scenario.Crossmodal3600Scenario",
         args={"location": location, "language": language},
     )
-    adapter_spec: AdapterSpec = _get_generation_adapter_spec(max_tokens=20)
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete sentence in plain words",
+        max_tokens=20,
+    )
+
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "crossmodal_3600"
     return RunSpec(
@@ -209,12 +260,23 @@ def get_crossmodal_3600_spec(location: str, language: str) -> RunSpec:


 @run_spec_function("flickr30k")
-def get_flickr30k_spec() -> RunSpec:
+def get_flickr30k_spec(num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.flickr30k_scenario.Flickr30KScenario", args={}
     )
-    adapter_spec: AdapterSpec = _get_captioning_adapter_spec()
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Generate a caption for the following image in plain words. The caption should "
+        "be short and needs to be a complete sentence.",
+        max_tokens=30,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "flickr30k"
     return RunSpec(
@@ -232,7 +294,7 @@ def get_gqa_spec() -> RunSpec:
         class_name="helm.benchmark.scenarios.vision_language.gqa_scenario.GQAScenario", args={}
     )
     adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec(
-        instructions="Answer the question using a single word or phrase."
+        instructions="Answer the question using a single word."
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

@@ -296,10 +358,14 @@ def get_mscoco_captioning_spec(long: bool = False) -> RunSpec:
     if long:
         adapter_spec = _get_generation_adapter_spec(
             instructions="Generate a long, detailed caption for the following image.",
-            max_tokens=150,
+            max_tokens=200,
         )
     else:
-        adapter_spec = _get_captioning_adapter_spec()
+        adapter_spec = _get_generation_adapter_spec(
+            instructions="Generate a caption for the following image. The caption should be short and does "
+            "not need to be a complete sentence.",
+            max_tokens=20,
+        )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + _get_open_ended_generation_metric_specs()

     run_spec_name: str = "mscoco_captioning"
@@ -403,10 +469,12 @@ def get_vqa_spec() -> RunSpec:


 @run_spec_function("image2latex")
-def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2latex_spec(
+    subset: str, recompile_prompt: bool = False, difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -415,7 +483,7 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="latex",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="padding",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -424,22 +492,32 @@ def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Opti
         )
     ]

-    run_spec_name: str = "image2latex"
+    run_spec_name: str = f"image2latex:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2latex_real"]
+    else:
+        groups = ["image2latex", f"image2latex_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )


 @run_spec_function("image2webpage")
-def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+def get_image2webpage_spec(
+    subset: str,
+    recompile_prompt: bool = False,
+    difficulty: str = DIFFICULTY_ALL,
+    args: Optional[Dict] = None,
+) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
-        args={"subset": subset, "recompile_prompt": recompile_prompt},
+        args={"subset": subset, "recompile_prompt": recompile_prompt, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -448,7 +526,7 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     metric_specs: List[MetricSpec] = _get_image2structure_metric_specs(
         generation_type="webpage",
         args=args,
-        include_edit_similarity=True,
+        include_edit_similarity=(subset != "real"),
         size_handling_method="none",
     )
     annotator_specs: List[AnnotatorSpec] = [
@@ -457,13 +535,18 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
         )
     ]

-    run_spec_name: str = "image2webpage"
+    run_spec_name: str = f"image2webpage:subset={subset}:difficulty={difficulty}"
+    groups: List[str]
+    if subset == "real":
+        groups = ["image2webpage_real"]
+    else:
+        groups = ["image2webpage", f"image2webpage_{difficulty}"]
     return RunSpec(
-        name=f"{run_spec_name}:subset={subset}",
+        name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -477,7 +560,9 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:

     adapter_spec: AdapterSpec
     if question_type == "free_form":
-        adapter_spec = _get_short_answer_generation_adapter_spec()
+        adapter_spec = _get_short_answer_generation_adapter_spec(
+            instructions="Just give the numerical answer without showing the steps, the unit, or percentage symbol."
+        )
     elif question_type == "multi_choice":
         adapter_spec = _get_multiple_choice_joint_adapter_spec(
             input_noun=None, output_noun="Answer", max_train_instances=0
@@ -497,10 +582,11 @@ def get_math_vista_spec(grade: str, question_type: str) -> RunSpec:


 @run_spec_function("image2musicsheet")
-def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+def get_image2musicsheet_spec(difficulty: str = DIFFICULTY_ALL, args: Optional[Dict] = None) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
-        args={"subset": "music", "recompile_prompt": False},  # There os only one subset for music sheets
+        # There os only one subset for music sheets
+        args={"subset": "music", "recompile_prompt": False, "difficulty": difficulty},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
         instructions="Just give a short answer without answering in a complete sentence.",
@@ -518,13 +604,14 @@ def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
         )
     ]

-    run_spec_name: str = "image2musicsheet"
+    run_spec_name: str = f"image2musicsheet:difficulty={difficulty}"
+    groups: List[str] = ["image2musicsheet", f"image2musicsheet_{difficulty}"]
     return RunSpec(
         name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=groups,
         annotators=annotator_specs,
     )

@@ -568,13 +655,14 @@ def get_unicorn_spec(subject: str) -> RunSpec:
         args={"subject": subject},
     )
     adapter_spec: AdapterSpec = _get_generation_adapter_spec(
-        instructions="Only give numerical or boolean answer without an explanation."
+        instructions="Only give a yes/no or numerical answer without an explanation.",
+        max_tokens=1,  # the model may generate answer with a period
     )
     metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

     run_spec_name: str = "unicorn"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -583,16 +671,26 @@ def get_unicorn_spec(subject: str) -> RunSpec:


 @run_spec_function("bingo")
-def get_bingo_spec(subject: str) -> RunSpec:
+def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
     )
-    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = _get_generation_adapter_spec(
+        instructions="Answer the question with a complete and clear explanation in sentences without listing it out.",
+        max_tokens=100,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(
+            num_respondents=num_respondents,
+            max_tokens=200,
+        )
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "bingo"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -661,7 +759,7 @@ def get_seed_bench_spec(subject: str) -> RunSpec:

     run_spec_name: str = "seed_bench"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -682,7 +780,7 @@ def get_mme_spec(subject: str) -> RunSpec:

     run_spec_name: str = "mme"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -739,17 +837,42 @@ def get_pairs_spec(subset: str, person: str) -> RunSpec:


 @run_spec_function("mementos")
-def get_mementos_spec(subject: str) -> RunSpec:
+def get_mementos_spec(subject: str, num_respondents: int) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
         args={"subject": subject},
     )
-    adapter_spec: AdapterSpec = _get_short_answer_generation_adapter_spec()
-    metric_specs: List[MetricSpec] = _get_open_ended_generation_metric_specs()
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )

     run_spec_name: str = "mementos"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vibe_eval")
+def get_vibe_eval_spec(subject: str, num_respondents: int) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = (
+        _get_prometheus_vision_critique_metric_specs(num_respondents=num_respondents, max_tokens=200)
+        + _get_open_ended_generation_metric_specs()
+    )
+
+    run_spec_name: str = "vibe_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
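
Several run-spec functions above (crossmodal_3600, flickr30k, bingo, mementos, vibe_eval) now take a num_respondents argument that is threaded into the Prometheus-Vision critique metrics. A minimal sketch of how the new vibe_eval entry resolves, assuming a placeholder subject value (the valid subjects are defined in vibe_eval_scenario.py, which this diff does not show):

# Illustrative only: direct call to the run-spec function added above.
# "difficulty-normal" is a placeholder subject, not a value taken from this diff.
run_spec = get_vibe_eval_spec(subject="difficulty-normal", num_respondents=1)
print(run_spec.name)  # "vibe_eval:subject=difficulty-normal"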

helm/benchmark/scenarios/air_bench_scenario.py (new file)

@@ -0,0 +1,50 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class AIRBench2024Scenario(Scenario):
+    """AIRBench 2024
+
+    Pre-publication: References will be added post-publication.
+
+    AIRBench 2024 is a AI safety benchmark that aligns with emerging government
+    regulations and company policies. It consists of 5,619 malicious prompts
+    spanning categories of the regulation-based safety categories in the
+    AIR 2024 safety taxonomy."""
+
+    name = "air_bench_2024"
+    description = (
+        "AIRBench 2024 is a AI safety benchmark that aligns with "
+        "emerging government regulations and company policies"
+    )
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # TODO: Switch this to the production dataset when available.
+        dataset = datasets.load_dataset("stanford-crfm/air-bench-2024", split="test", cache_dir=cache_dir)
+        instances: List[Instance] = []
+        # TODO: Allow users to filter by category
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            # References are category ID, followed by level 2, 3 and 4 category names.
+            references = [
+                Reference(output=Output(text=row[column_name]), tags=[])
+                for column_name in ["cate-idx", "l2-name", "l3-name", "l4-name"]
+            ]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
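
The run-spec and metric wiring for this scenario lives in helm/benchmark/run_specs/air_bench_run_specs.py and helm/benchmark/metrics/air_bench_metrics.py (listed in the files above but not shown in this diff). As a rough sketch of how a run spec would reference the new scenario class, with everything beyond the ScenarioSpec omitted because it is not visible here:

# Illustrative sketch only; the real wiring is in air_bench_run_specs.py, which this diff does not show.
from helm.benchmark.scenarios.scenario import ScenarioSpec  # import path assumed

scenario_spec = ScenarioSpec(
    class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario",
    args={},
)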

helm/benchmark/scenarios/ci_mcqa_scenario.py (new file)

@@ -0,0 +1,80 @@
+import json
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class CIMCQAScenario(Scenario):
+    """CIMCQA is a multiple-choice question answering (MCQA) dataset designed to
+    study concept inventories in CS Education.
+
+    This is used by a pre-publication paper.
+
+    NOTE: This code is for archival purposes only. The scenario cannot be run because it requires
+    private data. Please contact the paper authors for more information."""
+
+    DATASET_DOWNLOAD_URL: str = "https://drive.google.com/uc?export=download&id=1siYjhDiasI5FIiS0ckLbo40UnOj8EU2h"
+
+    name = "ci_mcqa"
+    description = (
+        "CIMCQA is a multiple-choice question answering (MCQA) dataset designed to"
+        "study concept inventories in CS Education."
+    )
+    tags = ["question_answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join("restricted", "bdsi_multiple_answers_removed.json")
+        assert os.path.exists(data_path)
+
+        with open(data_path, "r", encoding="utf8") as f:
+            data = json.load(f)
+
+        # Data is a list of dictionaries now, each one a question and its associated answers and metadata.
+        instances: List[Instance] = list()
+
+        # UNCOMMENT BELOW FOR FEW-SHOT RUN
+        training_data_path: str = os.path.join("restricted", "mock_bdsi_multiple_answers_removed.json")
+        assert os.path.exists(training_data_path)
+
+        with open(training_data_path, "r", encoding="utf8") as f:
+            training_data = json.load(f)
+        for question in training_data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TRAIN_SPLIT,
+            )
+            instances.append(instance)
+
+        for question in data:
+            question_text = question["question"]
+            references = list()
+            for index, answer in enumerate(question["options"]):
+                reference_answer = Output(text=answer)
+                # Correct option offset by 1 due to zero-indexing
+                tag = [CORRECT_TAG] if index == question["correct_option"] - 1 else []
+                references.append(Reference(reference_answer, tags=tag))
+            instance = Instance(
+                input=Input(text=question_text),
+                references=references,
+                split=TEST_SPLIT,  # Just doing zero shot to start
+            )
+            instances.append(instance)
+        return instances
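
The scenario above reads question records from the restricted JSON files using three fields: question, options, and a 1-indexed correct_option. A hypothetical record illustrating that shape (the field names come from the code above; the values are invented):

# Hypothetical example of one record in restricted/bdsi_multiple_answers_removed.json.
# Field names match the code above; the values are invented for illustration.
example_question = {
    "question": "Which data structure gives O(1) average-case lookup by key?",
    "options": ["Linked list", "Hash table", "Binary search tree", "Stack"],
    "correct_option": 2,  # 1-indexed, so this marks "Hash table" as correct
}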

helm/benchmark/scenarios/entity_data_imputation_scenario.py

@@ -41,8 +41,14 @@ class EntityDataImputationScenario(Scenario):
     def __init__(self, dataset: str, seed: int = 1234):
         super().__init__()
         self.datasets_paths = {
-            "Buy": "https://dbs.uni-leipzig.de/file/Abt-Buy.zip",
-            "Restaurant": "https://www.cs.utexas.edu/users/ml/riddle/data/restaurant.tar.gz",
+            "Buy": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/Abt-Buy.zip"
+            ),
+            "Restaurant": (
+                "https://storage.googleapis.com/crfm-helm-public/source_datasets/scenarios/"
+                "entity_data_imputation/restaurant.tar.gz"
+            ),
         }
         # Columns to impute
         self.datasets_impute_col = {