crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0

helm/benchmark/scenarios/test_financebench_scenario.py
ADDED

@@ -0,0 +1,26 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.financebench_scenario import FinanceBenchScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, TRAIN_SPLIT
+
+
+@pytest.mark.scenarios
+def test_air_2024_scenario_get_instances():
+    scenario = FinanceBenchScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+    assert len(instances) == 150
+    assert len([instance for instance in instances if instance.split == TRAIN_SPLIT]) == 10
+    assert (
+        "Evidence: Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\n"  # noqa: E501
+        in instances[0].input.text
+    )
+    assert (
+        "Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement."  # noqa: E501
+        in instances[0].input.text
+    )
+    assert len(instances[0].references) == 1
+    assert instances[0].references[0].output.text == "$1577.00"
+    assert instances[0].references[0].tags == [CORRECT_TAG]
+    assert instances[0].split == TEST_SPLIT
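
Note: each of the new scenario test files in this release carries the `@pytest.mark.scenarios` marker, because the tests download real datasets when run. A minimal sketch of what such a test exercises, assuming only the scenario API visible in this diff (the temporary directory doubles as the download cache):

from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.financebench_scenario import FinanceBenchScenario

# get_instances(output_path) downloads the dataset into output_path and
# returns Instance objects exposing .input, .references, and .split.
with TemporaryDirectory() as tmpdir:
    instances = FinanceBenchScenario().get_instances(tmpdir)
print(len(instances))  # the test above expects 150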

helm/benchmark/scenarios/test_gsm_scenario.py
ADDED

@@ -0,0 +1,31 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.gsm_scenario import GSM8KScenario
+from helm.benchmark.scenarios.scenario import Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_gsm_scenario_get_instances():
+    math_scenario = GSM8KScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = math_scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 8792
+    assert actual_instances[0].input == Input(
+        text=(
+            "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many"
+            " clips did Natalia sell altogether in April and May?"
+        )
+    )
+    assert actual_instances[0].references == [
+        Reference(
+            output=Output(
+                text=(
+                    "Natalia sold 48/2 = <<48/2=24>>24 clips in May. Natalia sold 48+24 = <<48+24=72>>72 clips"
+                    " altogether in April and May. The answer is 72."
+                )
+            ),
+            tags=["correct"],
+        )
+    ]
+    assert actual_instances[0].split == "train"

helm/benchmark/scenarios/test_legalbench_scenario.py
ADDED

@@ -0,0 +1,30 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.legalbench_scenario import LegalBenchScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_legalbench_scenario():
+    scenario = LegalBenchScenario(subset="abercrombie")
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+    assert len(instances) == 100
+    assert instances[0].input == Input(text='Description: The mark "Ivory" for a product made of elephant tusks.')
+    assert instances[0].references == [
+        Reference(output=Output(text="generic"), tags=["correct"]),
+    ]
+    assert instances[0].split == "train"
+
+    scenario = LegalBenchScenario(subset="proa")
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+    assert len(instances) == 100
+    assert instances[0].input == Input(
+        text="Statute: Amendments to pleadings must be filed within 15 days of the filing of the initial pleading."
+    )
+    assert instances[0].references == [
+        Reference(output=Output(text="No"), tags=[CORRECT_TAG]),
+    ]
+    assert instances[0].split == "train"

helm/benchmark/scenarios/test_math_scenario.py
CHANGED

@@ -5,18 +5,12 @@ from helm.benchmark.scenarios.math_scenario import MATHScenario
 from helm.benchmark.scenarios.scenario import Input, Output, Reference
 
 
-
-@pytest.mark.skip(
-    reason="Incompatible with newer versions with diffusers>0.24.0. Fails with "
-    '"Loading a dataset cached in a LocalFileSystem is not supported"'
-)
+@pytest.mark.scenarios
 def test_math_scenario_get_instances():
     math_scenario = MATHScenario(subject="number_theory", level="1")
     with TemporaryDirectory() as tmpdir:
         actual_instances = math_scenario.get_instances(tmpdir)
     assert len(actual_instances) == 77
     assert actual_instances[0].input == Input(text="What is the remainder when (99)(101) is divided by 9?")
-    assert actual_instances[0].references == [
-        Reference(output=Output(text="0", multimedia_content=None), tags=["correct"])
-    ]
+    assert actual_instances[0].references == [Reference(output=Output(text="0"), tags=["correct"])]
     assert actual_instances[0].split == "train"
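
The hunk above re-enables a previously skipped test under the shared `scenarios` marker. A hedged sketch of how such a marker is typically registered (HELM's actual pytest configuration is not shown in this diff and may differ):

# conftest.py (illustrative only)
def pytest_configure(config):
    # Registering the marker lets `pytest -m scenarios` select these tests
    # and `pytest -m "not scenarios"` exclude them from fast runs.
    config.addinivalue_line("markers", "scenarios: tests that download real scenario data")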

helm/benchmark/scenarios/test_med_qa_scenario.py
ADDED

@@ -0,0 +1,30 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.med_qa_scenario import MedQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_med_qa_scenario():
+    scenario = MedQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+    assert len(instances) == 12723
+    assert instances[0].input == Input(
+        text=(
+            "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it"
+            " started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She"
+            " otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C),"
+            " blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on"
+            " room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus."
+            " Which of the following is the best treatment for this patient?"
+        )
+    )
+    assert instances[0].references == [
+        Reference(output=Output(text="Ampicillin"), tags=[]),
+        Reference(output=Output(text="Ceftriaxone"), tags=[]),
+        Reference(output=Output(text="Doxycycline"), tags=[]),
+        Reference(output=Output(text="Nitrofurantoin"), tags=[CORRECT_TAG]),
+    ]
+    assert instances[0].split == "train"

helm/benchmark/scenarios/test_mmlu_scenario.py
ADDED

@@ -0,0 +1,33 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_mmlu_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = MMLUScenario(subject="abstract_algebra")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 116
+        assert instances[0].input == Input(text="Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.")
+        assert instances[0].references == [
+            Reference(output=Output(text="0"), tags=[]),
+            Reference(output=Output(text="1"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="2"), tags=[]),
+            Reference(output=Output(text="3"), tags=[]),
+        ]
+        assert instances[0].split == "train"
+
+        scenario = MMLUScenario(subject="anatomy")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 154
+        assert instances[0].input == Input(text="What is the embryological origin of the hyoid bone?")
+        assert instances[0].references == [
+            Reference(output=Output(text="The first pharyngeal arch"), tags=[]),
+            Reference(output=Output(text="The first and second pharyngeal arches"), tags=[]),
+            Reference(output=Output(text="The second pharyngeal arch"), tags=[]),
+            Reference(output=Output(text="The second and third pharyngeal arches"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"

helm/benchmark/scenarios/test_narrativeqa_scenario.py
ADDED

@@ -0,0 +1,73 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.narrativeqa_scenario import NarrativeQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_narrativeqa_scenario():
+    scenario = NarrativeQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+    assert len(instances) == 1572
+    assert (
+        instances[0].input.text
+        == "At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family, lives"
+        " an elderly spinster Miss Delmar, the aunt of the earl de Versely and Captain Delmar. Miss Delmar invites"
+        " Arabella Mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in"
+        " the house. Captain Delmar is known to visit his aunt at Madeline Hall frequently, accompanied by his"
+        " valet Ben Keene, who is also a private marine. Captain Delmar eventually suggests that Ben should propose"
+        " to Arabella, and the two marry in secret, to the frustration of Miss Delmar and Arabella's mother. The"
+        " captain is able to smooth over the situation with his aunt, even after it is discovered that Arabella was"
+        " six months pregnant at the time of the marriage. She later gives birth to a boy, who takes the Captain's"
+        " Christian name and Ben's surname--the titular Percival Keene.\nThe family moves to Chatham, after Ben is"
+        " ordered back with his detachment. Arabella opens up a successful shop and circulating library below her"
+        " house, enlisting the help of her mother and sister, Amelia. Percival becomes well known in town from his"
+        " mischievous pranks on officers and other strangers, often encouraged by his aunt Amelia. However,"
+        " Percival's mother and grandmother are less fond of his disregard for manners, and insist on sending him"
+        " to school after an episode in which he bites his grandmother. Percival reports to the school house of Mr."
+        " O'Gallagher, a poor Irish scholar, who rules his class with a system of severe corporal punishment. Mr."
+        " O'Gallagher routinely bullies Percival by stealing his lunch, leading Percival to seek revenge by"
+        " poisoning his sandwiches with calomel. On Guy Fawkes Day the schoolteacher confiscates all the"
+        " schoolboys' fireworks, for which Percival retaliates by setting off the collected fireworks while the"
+        " teacher sits above them, leading to the total destruction of the schoolhouse and near death of the"
+        " schoolmaster.\nWhen Percival is a young teenager, Captain Delmar reappears and offers him a position"
+        " aboard his new navy ship, the H.M. Calliope. While preparing to enter service, Percival overhears gossip"
+        " of his illegitimate birth, introducing the idea that Captain Delmar may be his father. He confronts his"
+        " mother about his parentage, which she at first harshly denies but later tearfully explains the truth of"
+        " her affair. Early in his service in the navy, Percival is captured during a pirate raid along with"
+        " others. The pirate crew is entirely black, and the captain explains that they are primarily escaped"
+        " slaves from the Americas. Percival is taken in as a cabin boy, and later dyes his skin tan in the"
+        " appearance of a mulatto to please the captain who doesn't approve of white skin. The pirates often seek"
+        " to take over slave trading vessels, killing every white person on board. During the taking of one such"
+        " vessel, Percival is able is convince the captain to spare the lives of a wealthy Dutch merchant and his"
+        " young daughter, Minnie. Eventually the H.M. Calliope takes the pirate ship, and Percival--unrecognizable"
+        " with his dyed skin--is taken as a prisoner, later to convince his fellow shipman of his true"
+        " identity.\nAfter his reappearance aboard the ship, Percival gains esteem among the crew and is welcomed"
+        " back by the emotional Captain Delmar. His reputation continues to grow over the course of his service in"
+        " conflicts with Dutch and French vessels around the island of Curacao. He also stands in for an ill"
+        " Captain Delmar in a duel with a French officer, effectively saving the captain's life. At this point, the"
+        " captain receives news that his older brother has died, making him the new Lord de Versely, and before"
+        " returning to England he grants Perceval command of his own schooner. After another intense but successful"
+        " battle with a French war ship, Percival is promoted to captain. During his service in the Navy, Percival"
+        " still partakes in the merry pranks of his youth, and at one point teams up with a mulatto hotel owner in"
+        " Curaçao to convince his fellow officers they've been poisoned. He also keeps correspondence with Minnie,"
+        " developing a romance with the beautiful heiress.\nNear the end of the story, Percival guides his crew"
+        " through a terrible storm in which many of the crew are killed and the ship is heavily damaged. After"
+        " being saved by another English vessel, he receives a letter informing him of Lord de Versely's sudden"
+        " death from heart complications and learns that he has been left all of his personal property. Percival is"
+        " still disappointed that he can not take his father's name. He later journey's with his friend Bob Cross"
+        " to Hamburg to reunite with Minnie, but is captured by French troops on the road and sentenced to"
+        " execution for spying. During a skirmish between the French and the Cossacks, Percival and Cross are able"
+        " to escape and continue on the road. At the end of the novel, Percival proposes to Minnie, and stands to"
+        " inherit a great fortune through her father. He also receives a letter from the de Versely attorney"
+        " letting him know he has been granted the arms and name of Delmar.\nQuestion: Who did Percival reunited"
+        " with?"
+    )
+
+    assert instances[0].references == [
+        Reference(output=Output(text="Minnie"), tags=[CORRECT_TAG]),
+        Reference(output=Output(text="minnie"), tags=[CORRECT_TAG]),
+    ]
+    assert instances[0].split == "train"

helm/benchmark/scenarios/thai_exam_scenario.py
CHANGED

@@ -86,9 +86,9 @@ class ThaiExamScenario(Scenario):
         super().__init__()
         self.exam = exam
 
-    def download_thai_exam(self, path: str):
+    def download_thai_exam(self, path: str, revision: str):
         ensure_file_downloaded(
-            "https://
+            f"https://huggingface.co/datasets/scb10x/thai_exam/resolve/{revision}/thai_exam.tar.gz",
             target_path=path,
             unpack=True,
         )
@@ -118,8 +118,8 @@ class ThaiExamScenario(Scenario):
 
     def get_instances(self, output_path) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
-
-
+        # ThaiExam (v1.0) revision = d78aef04ea3cc5095545e6951cb39e17c64e26a1
+        self.download_thai_exam(data_path, revision="d78aef04ea3cc5095545e6951cb39e17c64e26a1")
         instances: List[Instance] = []
         splits: Dict[str, str] = {
             "train": TRAIN_SPLIT,
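
Pinning the Hugging Face revision makes the ThaiExam download reproducible: a commit hash always resolves to the same tarball, unlike a moving branch. A minimal sketch of the pattern, reusing the helper call and hash from the hunk above (the `helm.common.general` import path is an assumption):

from helm.common.general import ensure_file_downloaded  # assumed import path

REVISION = "d78aef04ea3cc5095545e6951cb39e17c64e26a1"  # ThaiExam v1.0, per the diff

# Downloads the tarball at the pinned commit and unpacks it into target_path.
ensure_file_downloaded(
    f"https://huggingface.co/datasets/scb10x/thai_exam/resolve/{REVISION}/thai_exam.tar.gz",
    target_path="data",
    unpack=True,
)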

helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py
CHANGED

@@ -42,7 +42,7 @@ class AOKVQAScenario(Scenario):
     name = "a_okvqa"
     description = (
         "A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of "
-        "commonsense and world knowledge to answer ([
+        "commonsense and world knowledge to answer ([Schwenk et al., 2022](https://arxiv.org/abs/2206.01718))."
     )
     tags = ["vision-language", "knowledge", "reasoning"]
 

helm/benchmark/scenarios/vision_language/bingo_scenario.py
CHANGED

@@ -43,7 +43,7 @@ class BingoScenario(Scenario):
     Paper: https://arxiv.org/abs/2311.03287
     """
 
-
+    BINGO_HUGGINGFACE_DATASET_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main"
 
     IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main/images/{image_path}?download=true"
 
@@ -51,8 +51,8 @@ class BingoScenario(Scenario):
 
     name = "bingo"
     description = (
-        "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects"
-        "
+        "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects "
+        "([Cui et al., 2023](https://arxiv.org/abs/2311.03287))."
     )
     tags = ["vision-language"]
 
@@ -67,12 +67,12 @@ class BingoScenario(Scenario):
 
         # There is only the test split in Unicorn benchmark
         instances: List[Instance] = []
-        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+        question_data_files = {TEST_SPLIT: f"{self.BINGO_HUGGINGFACE_DATASET_URL}/{self._subject}.json"}
 
         # Process the test set
         for row in tqdm(
             load_dataset(
-
+                "json",
                 data_files=question_data_files,
                 split=TEST_SPLIT,
                 cache_dir=output_path,

helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py
CHANGED

@@ -75,7 +75,8 @@ class Crossmodal3600Scenario(Scenario):
     name = "crossmodal_3600"
     description = (
         "Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated "
-        "with human-generated reference captions in 36 languages.
+        "with human-generated reference captions in 36 languages. "
+        "([Thapliyal et al., 2022)](https://arxiv.org/abs/2205.12522))."
     )
     tags = ["vision-language", "multilinguality"]
 

helm/benchmark/scenarios/vision_language/exams_v_scenario.py
ADDED

@@ -0,0 +1,104 @@
+from typing import List, Set
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class ExamsVScenario(Scenario):
+    """
+    EXAMS-V: A Multi-Discipline Multilingual Multimodal Exam Benchmark for Evaluating Vision Language Models
+
+    A challenging multi-discipline multimodal multilingual exam benchmark for evaluating vision language models.
+    It consists of 20,932 multiple-choice questions across 20 school disciplines covering natural science,
+    social science, and other miscellaneous studies, e.g.,religion, fine arts, business, etc.
+
+    Paper: https://arxiv.org/abs/2403.10378
+    Website: https://huggingface.co/datasets/Rocktim/EXAMS-V
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "Rocktim/EXAMS-V"
+
+    VALID_LANGUAGES: Set[str] = {
+        "Chinese",
+        "Croation",
+        "Italian",
+        "Hungarian",
+        "Arabic",
+        "Serbian",
+        "Bulgarian",
+        "English",
+        "German",
+        "French",
+        "Spanish",
+        "Polish",
+    }
+    VALID_SUBJECT_GROUP: Set[str] = {
+        "Natural Science",
+        "Social Sciences",
+        "Other",
+    }
+    VALID_TYPES: Set[str] = {"text", "image_text"}
+
+    name = "exams_v"
+    description = (
+        "Multimodal and Multilingual benchmark to evaluate vision-language models across 20 school disciplines "
+        "covering natural science, social science, and other miscellaneous studies "
+        "([Das et al., 2024]( https://arxiv.org/abs/2403.10378))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning", "multilingual"]
+
+    def __init__(self, language: str, subject_grouped: str, type: str) -> None:
+        super().__init__()
+
+        subject_grouped = subject_grouped.replace("_", " ")
+        assert subject_grouped in self.VALID_SUBJECT_GROUP, f"Invalid subject_grouped: {subject_grouped}"
+        assert type in self.VALID_TYPES, f"Invalid type: {type}"
+        assert language in self.VALID_LANGUAGES, f"Invalid language: {language}"
+
+        self._language: str = language
+        self._subject_grouped: str = subject_grouped
+        self._type: str = type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+
+        for split in [TRAIN_SPLIT, TEST_SPLIT]:
+            for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=split, cache_dir=output_path)):
+                language: str = row["language"]
+                subject_grouped: str = row["subject_grouped"]
+                type: str = row["type"]
+
+                # Exclude examples that do not match the specified language, subject, and type
+                if language != self._language or subject_grouped != self._subject_grouped or type != self._type:
+                    continue
+
+                # Save the image to disk
+                image = row["image"]
+                image_file_name: str = generate_hash(image) + ".jpg"
+                local_image_path: str = os.path.join(output_path, image_file_name)
+                if not os.path.exists(local_image_path):
+                    image.convert("RGB").save(local_image_path)
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                ]
+                references: List[Reference] = [Reference(output=Output(text=row["answer_key"]), tags=[CORRECT_TAG])]
+                instances.append(
+                    Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=split)
+                )
+
+        return instances
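
For reference, a hypothetical instantiation of the new scenario; note that `__init__` above maps underscores in `subject_grouped` back to spaces before validating it:

from helm.benchmark.scenarios.vision_language.exams_v_scenario import ExamsVScenario

# "Natural_Science" becomes "Natural Science" via replace("_", " ") in the constructor.
scenario = ExamsVScenario(language="English", subject_grouped="Natural_Science", type="image_text")
instances = scenario.get_instances("benchmark_output/exams_v")  # output path is illustrative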

helm/benchmark/scenarios/vision_language/fair_face_scenario.py
ADDED

@@ -0,0 +1,136 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    TRAIN_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class FairFaceScenario(Scenario):
+    """
+    Identify the race, gender or age of a photo of a person.
+
+    @misc{kärkkäinen2019fairfacefaceattributedataset,
+        title={FairFace: Face Attribute Dataset for Balanced Race, Gender, and Age},
+        author={Kimmo Kärkkäinen and Jungseock Joo},
+        year={2019},
+        eprint={1908.04913},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/1908.04913},
+    }
+
+    Paper: https://arxiv.org/abs/1908.04913
+    Website: https://huggingface.co/datasets/HuggingFaceM4/FairFace
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "HuggingFaceM4/FairFace"
+
+    RACE_ATTRIBUTE: str = "race"
+    GENDER_ATTRIBUTE: str = "gender"
+    AGE_ATTRIBUTE: str = "age"
+
+    QUESTION_TEMPLATE: str = "Identify the {attribute} of the person in the image."
+
+    RACE_SUBGROUPS: List[str] = [
+        "East Asian",
+        "Indian",
+        "Black",
+        "White",
+        "Middle Eastern",
+        "Latino Hispanic",
+        "Southeast Asian",
+    ]
+    GENDER_SUBGROUPS: List[str] = ["Male", "Female"]
+    AGE_SUBGROUPS: List[str] = [
+        "0-2 years",
+        "3-9 years",
+        "10-19 years",
+        "20-29 years",
+        "30-39 years",
+        "40-49 years",
+        "50-59 years",
+        "60-69 years",
+        "Over 70 years",
+    ]
+
+    name = "fair_face"
+    description = (
+        "Identify the race, gender or age of a photo of a person "
+        "([Karkkainen et al., 2019](https://arxiv.org/abs/1908.04913))."
+    )
+    tags = ["vision-language", "fairness"]
+
+    def __init__(self, attribute: str, subgroup: str) -> None:
+        super().__init__()
+
+        subgroups: List[str]
+        if attribute == self.RACE_ATTRIBUTE:
+            subgroups = self.RACE_SUBGROUPS
+        elif attribute == self.GENDER_ATTRIBUTE:
+            subgroups = self.GENDER_SUBGROUPS
+        elif attribute == self.AGE_ATTRIBUTE:
+            subgroups = self.AGE_SUBGROUPS
+        else:
+            raise ValueError(f"Invalid attribute: {attribute}")
+
+        # Validate the value passed in for the subgroup argument and set possible subgroup choices.
+        # The subgroup passed in has a _ for spaces in the string.
+        subgroup = subgroup.replace("_", " ")
+        assert subgroup in subgroups, f"Invalid subgroup for {attribute} attribute: {subgroup}"
+        self._subgroup_choices: List[str] = subgroups
+        self._correct_subgroup_index: int = subgroups.index(subgroup)
+
+        self._attribute: str = attribute  # For answer column
+        self._question: str = self.QUESTION_TEMPLATE.format(attribute=attribute)  # What text to prompt the model?
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for split in [TRAIN_SPLIT, VALID_SPLIT]:
+            for row in tqdm(
+                load_dataset(
+                    self.HUGGINGFACE_DATASET_NAME,
+                    "1.25",
+                    split="validation" if split == VALID_SPLIT else split,
+                    cache_dir=output_path,
+                )
+            ):
+                # Filter out rows that do not match the subgroup
+                if row[self._attribute] != self._correct_subgroup_index:
+                    continue
+
+                # Save the image to disk
+                image = row["image"]
+                image_file_name: str = generate_hash(image) + ".jpg"
+                local_image_path: str = os.path.join(output_path, image_file_name)
+                if not os.path.exists(local_image_path):
+                    image.save(local_image_path)
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=self._question, content_type="text/plain"),
+                ]
+                references: List[Reference] = [
+                    Reference(
+                        output=Output(text=subgroup),
+                        tags=[CORRECT_TAG] if i == self._correct_subgroup_index else [],
+                    )
+                    for i, subgroup in enumerate(self._subgroup_choices)
+                ]
+                instances.append(
+                    Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=split)
+                )
+
+        return instances
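
Likewise, a hypothetical FairFace instantiation; each generated instance carries one reference per subgroup, with CORRECT_TAG attached only to the subgroup the scenario was constructed for:

from helm.benchmark.scenarios.vision_language.fair_face_scenario import FairFaceScenario

# "Middle_Eastern" becomes "Middle Eastern" via replace("_", " ") in the constructor.
scenario = FairFaceScenario(attribute="race", subgroup="Middle_Eastern")
instances = scenario.get_instances("benchmark_output/fair_face")  # output path is illustrative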

helm/benchmark/scenarios/vision_language/flickr30k_scenario.py
CHANGED

@@ -41,7 +41,7 @@ class Flickr30KScenario(Scenario):
     name = "flickr30k"
     description = (
         "An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr "
-        "images ([
+        "images ([Young et al., 2014](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))."
     )
     tags = ["vision-language"]
 

helm/benchmark/scenarios/vision_language/gqa_scenario.py
CHANGED

@@ -31,7 +31,7 @@ class GQAScenario(Scenario):
     }
 
     Paper: https://arxiv.org/abs/1902.09506
-    Website: https://
+    Website: https://cs.stanford.edu/people/dorarad/gqa/about.html
     """
 
     QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
@@ -40,7 +40,7 @@ class GQAScenario(Scenario):
     name = "gqa"
     description = (
         "Questions about real-world visual reasoning and compositional QA "
-        "([
+        "([Hudson and Manning, 2019](https://arxiv.org/abs/1902.09506))."
     )
     tags = ["vision-language", "reasoning"]
 

helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
CHANGED

@@ -49,7 +49,7 @@ class HatefulMemesScenario(Scenario):
     QUESTION: str = "Is this meme mean?"
 
     name = "hateful_memes"
-    description = "Identify if a meme is hateful or not ([
+    description = "Identify if a meme is hateful or not ([Kiela et al., 2020](https://arxiv.org/abs/2005.04790))."
     tags = ["vision-language"]
 
     def get_instances(self, output_path: str) -> List[Instance]:

helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py
RENAMED

@@ -20,7 +20,7 @@ class Chart2CSVScenario(Scenario):
 
     name = "chart2csv"
     description = "Convert a chart to CSV."
-    tags = ["vision-language", "
+    tags = ["vision-language", "image2struct"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
         assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
|