crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +77 -0
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +168 -45
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +254 -111
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +43 -9
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +9 -2
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +19 -0
- helm/config/model_deployments.yaml +412 -18
- helm/config/model_metadata.yaml +447 -25
- helm/config/tokenizer_configs.yaml +93 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py
CHANGED
@@ -264,6 +264,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +282,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

         for huggingface_model_name in args.enable_huggingface_models:
-            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value

         for huggingface_model_path in args.enable_local_huggingface_models:
-            register_huggingface_local_model_from_flag_value(huggingface_model_path)
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)

     run_entries: List[RunEntry] = []
     if args.conf_paths:
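For context, the new flag is simply forwarded to the Hugging Face registration helpers. A minimal sketch of the resulting call path, assuming the updated helpers accept the OpenVINO flag as a second positional argument (as the diff above suggests); the model name is a placeholder:

from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value

def register_hub_models(model_names: list, openvino: bool = False) -> None:
    # Mirrors the branching added to run.py above.
    for model_name in model_names:
        if openvino:
            register_huggingface_hub_model_from_flag_value(model_name, openvino)
        else:
            register_huggingface_hub_model_from_flag_value(model_name)

# Placeholder model; any Hugging Face Hub causal LM identifier would do.
register_hub_models(["gpt2"], openvino=True)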
helm/benchmark/run_expander.py
CHANGED
@@ -194,6 +194,15 @@ class StopRunExpander(RunExpander):
         self.value = value

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
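For illustration, the effect of the new "none" value is just to clear the adapter's stop sequences (and tag the run name with stop=none). A minimal sketch of that transformation, with a made-up stop sequence:

from dataclasses import replace
from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Hypothetical adapter spec that stops generation at "###".
adapter_spec = AdapterSpec(stop_sequences=["###"])

# StopRunExpander("none") returns a copy of the run spec whose adapter spec
# has no stop sequences, equivalent to:
no_stop = replace(adapter_spec, stop_sequences=[])
assert no_stop.stop_sequences == []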
@@ -1035,6 +1044,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1390,72 @@ class ChatMLRunExpander(RunExpander):
     ]


+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    def __init__(self, scenario: str):
+        self.scenario = scenario
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+            if run_spec.adapter_spec.output_prefix:
+                instructions = (
+                    f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                )
+
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            else:
+                instructions = f"{instructions}\n"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        raise ValueError(f"Unknown scenario {self.scenario}")
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
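As a rough usage sketch, the expander can be applied to a run spec directly. The RunSpec below is a placeholder (the MMLU scenario class name and run name are illustrative); only the expander behavior comes from the diff above:

from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.run_expander import OutputFormatInstructions
from helm.benchmark.run_spec import RunSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec

# Placeholder multiple-choice run spec.
run_spec = RunSpec(
    name="mmlu:subject=anatomy",
    scenario_spec=ScenarioSpec(
        class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": "anatomy"}
    ),
    adapter_spec=AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT),
    metric_specs=[],
    groups=["mmlu"],
)

expanded = OutputFormatInstructions("mmlu").expand(run_spec)
print(expanded[0].adapter_spec.instructions)  # Answer with only a single letter.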
@@ -1402,6 +1478,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]


helm/benchmark/run_spec_factory.py
CHANGED
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
         run_spec = singleton(increase_temperature_expander.expand(run_spec))

+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
helm/benchmark/run_specs/air_bench_run_specs.py
ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )
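A quick way to see what the new entry wires together, as a sketch (assumes an environment with crfm-helm installed; the values in the comments come from the file above):

from helm.benchmark.run_specs.air_bench_run_specs import get_air_bench_2024_spec

spec = get_air_bench_2024_spec()
print(spec.adapter_spec.max_tokens)              # 512
print([a.class_name for a in spec.annotators])   # the AIRBench2024Annotator
print([m.class_name for m in spec.metric_specs]) # score, generation, and instances-per-split metrics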
helm/benchmark/run_specs/classic_run_specs.py
CHANGED
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:


 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]

     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )


 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")

     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@
         max_tokens=512,
     )

+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
         name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-
-
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )

@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
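One consequence of the thai_exam change is that each run is now also grouped under its exam subset. A small sketch, assuming crfm-helm is installed (the expected output follows from the diff above):

from helm.benchmark.run_specs.classic_run_specs import get_thai_exam_spec

spec = get_thai_exam_spec()  # defaults to exam="onet"
print(spec.groups)  # ['thai_exam', 'thai_exam_onet']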
helm/benchmark/run_specs/decodingtrust_run_specs.py
CHANGED
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
         groups=["decodingtrust", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py
ADDED
@@ -0,0 +1,33 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
helm/benchmark/run_specs/finance_run_specs.py
ADDED
@@ -0,0 +1,33 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
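And a parallel sketch for the new FinQA entry, again purely illustrative and assuming crfm-helm is installed (calling the function imports the new fin_qa_scenario module for its INSTRUCTIONS constant):

from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

spec = get_fin_qa_spec()
print(spec.name)                     # fin_qa
print(spec.adapter_spec.max_tokens)  # 100
print(spec.metric_specs[-1].class_name)
# -> helm.benchmark.metrics.fin_qa_metrics.FinQAMetric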