crfm-helm 0.5.1-py3-none-any.whl → 0.5.3-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
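To check which of the two releases is installed in a local environment before reading the diff, the Python standard library can be queried. This is a minimal illustrative sketch, not part of the diff, and assumes crfm-helm is already installed:

from importlib.metadata import version

# Prints the installed crfm-helm version, e.g. "0.5.1" or "0.5.3".
print(version("crfm-helm"))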

Note: the diff service flags this release of crfm-helm as potentially problematic.

Files changed (236)
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
  3. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +32 -31
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
  6. helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
  7. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
  8. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
  9. helm/benchmark/adaptation/common_adapter_specs.py +2 -0
  10. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  11. helm/benchmark/annotation/annotator_factory.py +6 -0
  12. helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
  13. helm/benchmark/annotation/call_center_annotator.py +247 -0
  14. helm/benchmark/annotation/financebench_annotator.py +79 -0
  15. helm/benchmark/annotation/harm_bench_annotator.py +68 -0
  16. helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
  17. helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
  18. helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
  19. helm/benchmark/annotation/live_qa_annotator.py +71 -0
  20. helm/benchmark/annotation/medication_qa_annotator.py +68 -0
  21. helm/benchmark/annotation/model_as_judge.py +45 -0
  22. helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
  23. helm/benchmark/annotation/xstest_annotator.py +110 -0
  24. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  25. helm/benchmark/huggingface_registration.py +16 -6
  26. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  27. helm/benchmark/metrics/annotation_metrics.py +108 -0
  28. helm/benchmark/metrics/bhasa_metrics.py +188 -0
  29. helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
  30. helm/benchmark/metrics/code_metrics_helper.py +11 -1
  31. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  32. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  33. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  35. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  36. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  37. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  38. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  39. helm/benchmark/metrics/safety_metrics.py +57 -0
  40. helm/benchmark/metrics/summac/model_summac.py +3 -3
  41. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
  42. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
  43. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  44. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  45. helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
  46. helm/benchmark/metrics/vision_language/image_utils.py +1 -1
  47. helm/benchmark/model_metadata_registry.py +3 -3
  48. helm/benchmark/presentation/schema.py +54 -4
  49. helm/benchmark/presentation/test_run_entry.py +1 -0
  50. helm/benchmark/presentation/test_schema.py +11 -0
  51. helm/benchmark/run.py +31 -2
  52. helm/benchmark/run_expander.py +113 -10
  53. helm/benchmark/run_spec_factory.py +4 -0
  54. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  55. helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
  56. helm/benchmark/run_specs/call_center_run_specs.py +152 -0
  57. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  58. helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
  59. helm/benchmark/run_specs/experimental_run_specs.py +85 -0
  60. helm/benchmark/run_specs/finance_run_specs.py +110 -0
  61. helm/benchmark/run_specs/safety_run_specs.py +154 -0
  62. helm/benchmark/run_specs/vlm_run_specs.py +251 -57
  63. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  64. helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
  65. helm/benchmark/scenarios/banking77_scenario.py +51 -0
  66. helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
  67. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  68. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  69. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
  70. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  71. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  72. helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
  73. helm/benchmark/scenarios/financebench_scenario.py +53 -0
  74. helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
  75. helm/benchmark/scenarios/scenario.py +1 -1
  76. helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
  77. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  78. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  79. helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
  80. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  81. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  82. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  83. helm/benchmark/scenarios/test_math_scenario.py +2 -8
  84. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  85. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  86. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  87. helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
  88. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
  89. helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
  90. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
  91. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  92. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  93. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
  94. helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
  95. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
  96. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
  97. helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
  98. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
  99. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
  100. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
  101. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
  102. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
  103. helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
  104. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
  105. helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
  106. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
  107. helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
  108. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
  109. helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
  110. helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
  111. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  112. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
  113. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
  114. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  115. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
  116. helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
  117. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  118. helm/benchmark/server.py +1 -6
  119. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  120. helm/benchmark/static/schema_bhasa.yaml +709 -0
  121. helm/benchmark/static/schema_call_center.yaml +232 -0
  122. helm/benchmark/static/schema_classic.yaml +3 -59
  123. helm/benchmark/static/schema_cleva.yaml +768 -0
  124. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  125. helm/benchmark/static/schema_ewok.yaml +367 -0
  126. helm/benchmark/static/schema_finance.yaml +189 -0
  127. helm/benchmark/static/schema_image2struct.yaml +588 -0
  128. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  129. helm/benchmark/static/schema_lite.yaml +3 -61
  130. helm/benchmark/static/schema_medical.yaml +255 -0
  131. helm/benchmark/static/schema_mmlu.yaml +3 -61
  132. helm/benchmark/static/schema_safety.yaml +247 -0
  133. helm/benchmark/static/schema_tables.yaml +317 -0
  134. helm/benchmark/static/schema_thai.yaml +244 -0
  135. helm/benchmark/static/schema_unitxt.yaml +3 -61
  136. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
  137. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  138. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  139. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  140. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  141. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  142. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  143. helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
  144. helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
  145. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  146. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  147. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  148. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  149. helm/benchmark/static_build/index.html +2 -2
  150. helm/benchmark/window_services/test_openai_window_service.py +8 -8
  151. helm/clients/ai21_client.py +71 -1
  152. helm/clients/anthropic_client.py +50 -28
  153. helm/clients/auto_client.py +11 -0
  154. helm/clients/client.py +24 -7
  155. helm/clients/cohere_client.py +98 -3
  156. helm/clients/huggingface_client.py +79 -19
  157. helm/clients/nvidia_nim_client.py +35 -0
  158. helm/clients/openai_client.py +11 -5
  159. helm/clients/palmyra_client.py +25 -0
  160. helm/clients/perspective_api_client.py +11 -6
  161. helm/clients/reka_client.py +189 -0
  162. helm/clients/test_client.py +7 -9
  163. helm/clients/test_huggingface_client.py +19 -3
  164. helm/clients/test_together_client.py +72 -2
  165. helm/clients/together_client.py +129 -23
  166. helm/clients/vertexai_client.py +62 -18
  167. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  168. helm/clients/vision_language/open_flamingo_client.py +1 -2
  169. helm/clients/vision_language/paligemma_client.py +146 -0
  170. helm/clients/vision_language/palmyra_vision_client.py +99 -0
  171. helm/clients/yi_client.py +31 -0
  172. helm/common/critique_request.py +10 -1
  173. helm/common/images_utils.py +25 -0
  174. helm/common/mongo_key_value_store.py +2 -1
  175. helm/common/request.py +16 -0
  176. helm/config/model_deployments.yaml +740 -363
  177. helm/config/model_metadata.yaml +824 -128
  178. helm/config/tokenizer_configs.yaml +207 -10
  179. helm/proxy/critique/model_critique_client.py +32 -4
  180. helm/proxy/example_queries.py +14 -21
  181. helm/proxy/services/server_service.py +2 -3
  182. helm/proxy/token_counters/test_auto_token_counter.py +2 -2
  183. helm/tokenizers/ai21_tokenizer.py +51 -59
  184. helm/tokenizers/auto_tokenizer.py +1 -1
  185. helm/tokenizers/cohere_tokenizer.py +29 -62
  186. helm/tokenizers/huggingface_tokenizer.py +35 -13
  187. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  188. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  189. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  190. helm/benchmark/static/benchmarking.css +0 -156
  191. helm/benchmark/static/benchmarking.js +0 -1705
  192. helm/benchmark/static/config.js +0 -3
  193. helm/benchmark/static/general.js +0 -122
  194. helm/benchmark/static/images/crfm-logo.png +0 -0
  195. helm/benchmark/static/images/helm-logo-simple.png +0 -0
  196. helm/benchmark/static/images/helm-logo.png +0 -0
  197. helm/benchmark/static/images/language-model-helm.png +0 -0
  198. helm/benchmark/static/images/organizations/ai21.png +0 -0
  199. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  200. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  201. helm/benchmark/static/images/organizations/cohere.png +0 -0
  202. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  203. helm/benchmark/static/images/organizations/google.png +0 -0
  204. helm/benchmark/static/images/organizations/meta.png +0 -0
  205. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  206. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  207. helm/benchmark/static/images/organizations/openai.png +0 -0
  208. helm/benchmark/static/images/organizations/together.png +0 -0
  209. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  210. helm/benchmark/static/images/organizations/yandex.png +0 -0
  211. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  212. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  213. helm/benchmark/static/index.html +0 -68
  214. helm/benchmark/static/info-icon.png +0 -0
  215. helm/benchmark/static/json-urls.js +0 -69
  216. helm/benchmark/static/plot-captions.js +0 -27
  217. helm/benchmark/static/schema_image2structure.yaml +0 -304
  218. helm/benchmark/static/utils.js +0 -285
  219. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  220. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  221. helm/benchmark/window_services/ai21_window_service.py +0 -247
  222. helm/benchmark/window_services/cohere_window_service.py +0 -101
  223. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  224. helm/benchmark/window_services/test_cohere_window_service.py +0 -75
  225. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  226. helm/benchmark/window_services/test_ice_window_service.py +0 -327
  227. helm/tokenizers/ice_tokenizer.py +0 -30
  228. helm/tokenizers/test_ice_tokenizer.py +0 -57
  229. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
  230. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
  231. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
  232. /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
  233. /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
  234. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
  235. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
  236. /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
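The listing above also renames a number of modules from image2structure to image2struct; code that imports the old module paths must follow the rename. A minimal sketch (the module paths come from the listing; the import is illustrative only):

# Module path in 0.5.1 (removed in 0.5.3):
#   helm.benchmark.annotation.image2structure.latex_compiler_annotator
# Module path in 0.5.3:
import helm.benchmark.annotation.image2struct.latex_compiler_annotator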
helm/benchmark/run_specs/call_center_run_specs.py (new file)
@@ -0,0 +1,152 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("call_center_summarization")
+def get_call_center_summarization_spec(subset: str = "summarization") -> RunSpec:
+    from helm.benchmark.annotation.call_center_annotator import CallCenterSummarizationAnnotator
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
+        args={"subset": subset},
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator")
+    ]
+    annotation_metric_specs = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
+            args={
+                "annotator_name": CallCenterSummarizationAnnotator.name,
+                "key": criterion,
+                "min_score": 1,
+                "max_score": 5,
+            },
+        )
+        for criterion in CallCenterSummarizationAnnotator.CRITERIA
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + annotation_metric_specs
+
+    group = "call_center_summarization" if subset == "summarization" else f"call_center_summarization_{subset}"
+
+    return RunSpec(
+        name="call_center_summarization",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=[group],
+    )
+
+
+@run_spec_function("call_center_summarization_pairwise_comparison")
+def get_call_center_summarization_pairwise_comparison_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator"  # noqa: E501
+        )
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+            args={"annotator_name": "call_center_summarization_pairwise_comparison", "key": "score"},
+        )
+    ]
+
+    return RunSpec(
+        name="call_center_summarization_pairwise_comparison",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["call_center_summarization_pairwise_comparison"],
+    )
+
+
+@run_spec_function("call_center_summarization_key_points_recall")
+def get_call_center_summarization_key_points_recall_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
+    )
+
+    instructions = "Summarize the call transcript in under 10 sentences."
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="### Call Transcript\n",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        temperature=0.0,
+        max_tokens=512,
+        num_outputs=1,
+    )
+
+    annotator_specs = annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator"
+        )
+    ]
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+            args={"annotator_name": "call_center_summarization_key_points_recall", "key": "score"},
+        )
+    ]
+
+    return RunSpec(
+        name="call_center_summarization_key_points_recall",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["call_center_summarization_key_points_recall"],
+    )
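For context, each function decorated with @run_spec_function above can also be called directly to build its RunSpec; a minimal sketch, assuming the crfm-helm 0.5.3 wheel and its dependencies are installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.call_center_run_specs import get_call_center_summarization_spec

# Build the RunSpec for the default "summarization" subset and inspect what it wires together.
run_spec = get_call_center_summarization_spec(subset="summarization")
print(run_spec.name)    # call_center_summarization
print(run_spec.groups)  # ["call_center_summarization"]
print([annotator.class_name for annotator in run_spec.annotators])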
helm/benchmark/run_specs/classic_run_specs.py
@@ -24,6 +24,7 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_ranking_binary_adapter_spec,
     get_summarization_adapter_spec,
 )
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
     get_bias_metric_specs,
@@ -1166,8 +1167,6 @@ def get_pubmed_qa_spec() -> RunSpec:
 
 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1177,22 +1176,23 @@ def get_live_qa_spec() -> RunSpec:
         max_train_instances=0,
         max_tokens=512,
     )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.live_qa_annotator.LiveQAAnnotator")]
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.live_qa_metrics.LiveQAScoreMetric")
+    ]
 
     return RunSpec(
         name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "live_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["live_qa"],
     )
 
 
 @run_spec_function("medication_qa")
 def get_medication_qa_spec() -> RunSpec:
-    from helm.common.gpu_utils import get_torch_device_name
-
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
 
     adapter_spec = get_generation_adapter_spec(
@@ -1203,13 +1203,17 @@ def get_medication_qa_spec() -> RunSpec:
         max_tokens=512,
     )
 
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
+
     return RunSpec(
        name="medication_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_summarization_metric_specs(
-            {"task": "medication_qa", "device": get_torch_device_name()},
-        ),
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
         groups=["medication_qa"],
     )
 
@@ -1506,5 +1510,5 @@ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_J
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["thai_exam"],
+        groups=["thai_exam", f"thai_exam_{exam}"],
     )
helm/benchmark/run_specs/decodingtrust_run_specs.py
@@ -56,7 +56,7 @@ def get_decodingtrust_stereotype_bias_spec(task: str) -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_stereotype_bias_metric_specs(),
-        groups=["decodingtrust", "stereotype_bias"],
+        groups=["decodingtrust_stereotype_bias", "stereotype_bias"],
     )
 
 
@@ -74,7 +74,7 @@ def get_decodingtrust_adv_robustness_spec(task: str) -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "adv_robustness"],
+        groups=["decodingtrust_adv_robustness", "adv_robustness"],
     )
 
 
@@ -92,7 +92,7 @@ def get_decodingtrust_adv_demonstration_spec(perspective: str, data: str, demo_n
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "adv_demonstration"],
+        groups=["decodingtrust_adv_demonstration", "adv_demonstration"],
     )
 
 
@@ -137,7 +137,7 @@ def get_decodingtrust_ood_robustness_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() if ood_type == "style" else get_ood_knowledge_metric_specs(),
-        groups=["decodingtrust", "ood_robustness"],
+        groups=["decodingtrust_ood_robustness", "ood_robustness"],
     )
 
 
@@ -163,7 +163,7 @@ def get_decodingtrust_fairness_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_fairness_metric_specs() + get_exact_match_metric_specs(),
-        groups=["decodingtrust", "fairness"],
+        groups=["decodingtrust_fairness", "fairness"],
     )
 
 
@@ -192,7 +192,7 @@ def get_decodingtrust_privacy_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_privacy_metric_specs(),
-        groups=["decodingtrust", "privacy"],
+        groups=["decodingtrust_privacy", "privacy"],
     )
 
 
@@ -280,7 +280,7 @@ def get_decodingtrust_machine_ethics_spec(
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["decodingtrust", "machine_ethics"],
+        groups=["decodingtrust_machine_ethics", "machine_ethics"],
     )
 
 
@@ -309,6 +309,8 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
         name="decodingtrust_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
-        groups=["decodingtrust", "toxicity_prompts"],
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
+        groups=["decodingtrust_toxicity_prompts", "toxicity_prompts"],
     )
helm/benchmark/run_specs/experimental_run_specs.py (new file)
@@ -0,0 +1,85 @@
+"""Run specs for experiments only.
+
+These run specs are not intended for use with public leaderboards."""
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("ci_mcqa")
+def get_ci_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ci_mcqa_scenario.CIMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=(
+            "Give a letter answer among the options given. "
+            "For example, if the options are A, B, C, D, E, and F, "
+            "your answer should consist of the single letter that corresponds to the correct answer."
+        ),
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="ci_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["CIMCQA"],
+    )
+
+
+@run_spec_function("ewok")
+def get_ewok_spec(domain: str = "all") -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.ewok_scenario.EWoKScenario", args={"domain": domain}
+    )
+
+    instructions = """# INSTRUCTIONS
+
+In this study, you will see multiple examples. In each example, you will be given two contexts and a scenario. Your task is to read the two contexts and the subsequent scenario, and pick the context that makes more sense considering the scenario that follows. The contexts will be numbered "1" or "2". You must answer using "1" or "2" in your response.
+"""  # noqa: E501
+    input_prefix = """# TEST EXAMPLE
+
+## Scenario
+\""""
+    input_suffix = """\"
+
+## Contexts
+"""
+    output_prefix = """
+## Task
+Which context makes more sense given the scenario? Please answer using either "1" or "2".
+
+## Response
+"""
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=instructions,
+        input_prefix=input_prefix,
+        input_suffix=input_suffix,
+        reference_prefix='1. "',
+        reference_suffix='"\n',
+        output_prefix=output_prefix,
+        output_suffix="\n",
+        max_train_instances=2,
+        num_outputs=1,
+        max_tokens=2,
+        temperature=0.0,
+        stop_sequences=["\n\n"],
+        sample_train=True,
+    )
+
+    return RunSpec(
+        name=f"ewok:domain={domain}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["ewok", f"ewok_{domain}"],
+    )
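As with the other new run spec modules, the EWoK function above can be invoked directly to inspect the prompt framing it defines; a minimal sketch, assuming the 0.5.3 wheel is installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.experimental_run_specs import get_ewok_spec

# Build the EWoK RunSpec with the default domain and look at the adapter configuration.
run_spec = get_ewok_spec()
print(run_spec.name)                     # ewok:domain=all
print(run_spec.groups)                   # ["ewok", "ewok_all"]
print(run_spec.adapter_spec.max_tokens)  # 2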
helm/benchmark/run_specs/finance_run_specs.py (new file)
@@ -0,0 +1,110 @@
+"""Run spec functions for the HELM Finance leaderboard.
+
+Website: https://crfm.stanford.edu/helm/finance/"""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+)
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
+
+
+@run_spec_function("fin_qa")
+def get_fin_qa_spec() -> RunSpec:
+    from helm.benchmark.scenarios.fin_qa_scenario import INSTRUCTIONS
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.fin_qa_scenario.FinQAScenario", args={})
+    adapter_spec = get_generation_adapter_spec(
+        instructions=INSTRUCTIONS, input_noun=None, output_noun="Program", max_tokens=100
+    )
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.fin_qa_metrics.FinQAMetric")
+    ]
+    return RunSpec(
+        name="fin_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["fin_qa"],
+    )
+
+
+@run_spec_function("financebench")
+def get_financebench_spec() -> RunSpec:
+    instructions = (
+        "Answer only the last question using the given evidence. "
+        "Respond with only a single paragraph, sentence or sentence fragment.\n"
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.financebench_scenario.FinanceBenchScenario", args={}
+    )
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=instructions,
+        input_prefix="\n",
+        input_suffix="\n",
+        output_prefix="\nAnswer: ",
+        output_suffix="\n",
+        instance_prefix="\n###\n",
+        num_outputs=1,
+        max_tokens=300,
+        temperature=0.0,
+        stop_sequences=["###"],
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.financebench_annotator.FinanceBenchAnnotator")
+    ]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLabelMetric",
+            args={
+                "annotator_name": "financebench",
+                "key": "label",
+                "labels": ["correct_answer", "incorrect_answer", "failure_to_answer"],
+            },
+        )
+    ]
+    return RunSpec(
+        name="financebench",
+        scenario_spec=scenario_spec,
+        annotators=annotator_specs,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["financebench"],
+    )
+
+
+@run_spec_function("banking77")
+def get_banking77_spec() -> RunSpec:
+    from helm.benchmark.scenarios.raft_scenario import get_raft_instructions
+    from helm.benchmark.scenarios.banking77_scenario import Banking77Scenario
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})
+
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_raft_instructions("banking_77", scenario_cache_path),
+        input_noun=None,
+        output_noun="Label",
+        max_tokens=30,  # at most ~50 characters per label
+    )
+
+    # Not using get_classification_metric_specs() / ClassificationMetric because BANKING77 has too many classes,
+    # so F1 scores don't make sense. The original paper uses accuracy instead.
+    metric_specs = get_exact_match_metric_specs()
+    return RunSpec(
+        name="banking77",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["banking77"],
+    )
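The finance run specs follow the same pattern; for example, the FinQA spec can be built directly to confirm the adapter and metric wiring shown above. A minimal sketch, assuming the 0.5.3 wheel and its dependencies are installed (illustrative only):

from helm.benchmark.run_specs.finance_run_specs import get_fin_qa_spec

# Build the FinQA RunSpec: a generation adapter with output_noun="Program",
# plus the FinQAMetric added on top of the basic metrics.
run_spec = get_fin_qa_spec()
print(run_spec.name)    # fin_qa
print(run_spec.groups)  # ["fin_qa"]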
helm/benchmark/run_specs/safety_run_specs.py (new file)
@@ -0,0 +1,154 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+from helm.benchmark.metrics.metric import MetricSpec
+
+
+@run_spec_function("harm_bench")
+def get_harm_bench_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.harm_bench_scenario.HarmBenchScenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="harm_bench",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["harm_bench"],
+    )
+
+
+@run_spec_function("simple_safety_tests")
+def get_simple_safety_tests_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.simple_safety_tests_scenario.SimpleSafetyTestsScenario"
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.simple_safety_tests_annotator.SimpleSafetyTestsAnnotator")
+    ]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="simple_safety_tests",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["simple_safety_tests"],
+    )
+
+
+@run_spec_function("xstest")
+def get_xstest_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.xstest_scenario.XSTestScenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.xstest_annotator.XSTestAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="xstest",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["xstest"],
+    )
+
+
+@run_spec_function("anthropic_red_team")
+def get_anthropic_red_team_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.anthropic_red_team_scenario.AnthropicRedTeamScenario"
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.anthropic_red_team_annotator.AnthropicRedTeamAnnotator")
+    ]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="anthropic_red_team",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["anthropic_red_team"],
+    )
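All four safety run specs above share the same pass-through AdapterSpec (empty instructions and prefixes, zero-shot, temperature 0.0) and the same three safety metrics; only the scenario and the annotator differ. A minimal sketch that builds one of them, assuming the 0.5.3 wheel is installed (illustrative only, not part of the diff):

from helm.benchmark.run_specs.safety_run_specs import get_xstest_spec

# Build the XSTest RunSpec and confirm the annotator and safety metrics it uses.
run_spec = get_xstest_spec()
print(run_spec.name)  # xstest
print([annotator.class_name for annotator in run_spec.annotators])
print([metric.class_name for metric in run_spec.metric_specs])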