crfm-helm 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +81 -112
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +165 -155
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +37 -45
- helm/benchmark/annotation/medication_qa_annotator.py +36 -44
- helm/benchmark/annotation/model_as_judge.py +96 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +79 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +83 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +652 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +82 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +100 -24
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1942 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +266 -0
- helm/benchmark/static/schema_tables.yaml +149 -8
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +137 -101
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +18 -4
- helm/clients/palmyra_client.py +24 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/together_client.py +22 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/cache.py +8 -30
- helm/common/images_utils.py +6 -0
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +5 -4
- helm/common/request.py +16 -0
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +444 -329
- helm/config/model_metadata.yaml +513 -111
- helm/config/tokenizer_configs.yaml +140 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +6 -20
- helm/proxy/services/service.py +0 -6
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/{data_overlap → scenarios/vision_language/image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct/webpage}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/call_center_scenario.py
@@ -0,0 +1,84 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class CallCenterSummarizationScenario(Scenario):
+    """Call center summarization."""
+
+    name = "call_center_summarization"
+    description = "Call center summarization."
+    tags = ["call_center"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset("yifanmai/call-center", self.subset, split="test", cache_dir=cache_dir)
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["transcript"])
+            instance = Instance(input=input, references=[], split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
+
+
+class CallCenterSummarizationPairwiseComparisonScenario(Scenario):
+    """Call center summarization."""
+
+    name = "call_center_summarization_pairwise_comparison"
+    description = "Call center summarization."
+    tags = ["call_center"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir
+        )
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["transcript"])
+            reference = Reference(output=Output(text=row["gpt-4o-mini-2024-07-18_summary"]), tags=[CORRECT_TAG])
+            instance = Instance(input=input, references=[reference], split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
+
+
+class CallCenterSummarizationKeyPointsRecallScenario(Scenario):
+    """Call center summarization."""
+
+    name = "call_center_summarization_key_points_recall"
+    description = "Call center summarization."
+    tags = ["call_center"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir
+        )
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["transcript"])
+            references = [
+                Reference(output=Output(text=key_point), tags=[CORRECT_TAG])
+                for key_point in row["gpt-4o-mini-2024-07-18_key_points"]
+            ]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
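A minimal sketch of how one of the scenario classes above could be driven directly, assuming crfm-helm 0.5.4 is installed and the yifanmai/call-center dataset is reachable on the Hugging Face Hub; the subset name "summarization" is a hypothetical placeholder, not a value confirmed by this diff:

# Sketch only: "summarization" is a hypothetical subset name.
from helm.benchmark.scenarios.call_center_scenario import CallCenterSummarizationScenario

scenario = CallCenterSummarizationScenario(subset="summarization")
instances = scenario.get_instances(output_path="./benchmark_output")
# Every instance is placed in the test split with the raw transcript as input.
print(len(instances), instances[0].split)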
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py
@@ -1,7 +1,7 @@
 import json
 import os
 from typing import List, Dict
-from .scenario import Scenario, Instance, Input
+from .scenario import Scenario, Instance, Input, TEST_SPLIT
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output
 
@@ -61,6 +61,7 @@ class DecodingTrustStereotypeBiasScenario(Scenario):
                         tags=[stereotype_topic_tag, demographic_group_tag, sys_prompt_type_tag],
                     )
                 ],
+                split=TEST_SPLIT,
            )
            instances.append(instance)
 
helm/benchmark/scenarios/ewok_scenario.py
@@ -0,0 +1,116 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class EWoKScenario(Scenario):
+    """Elements of World Knowledge (EWoK)
+
+    Elements of World Knowledge (EWoK) is a framework for evaluating world modeling in
+    language models by testing their ability to use knowledge of a concept to match a
+    target text with a plausible/implausible context. EWoK targets specific concepts
+    from multiple knowledge domains known to be vital for world modeling in humans.
+    Domains range from social interactions (help/hinder) to spatial relations (left/right).
+    Both, contexts and targets are minimal pairs. Objects, agents, and locations in the items
+    can be flexibly filled in enabling easy generation of multiple controlled datasets.
+
+    EWoK-CORE-1.0 is a dataset of 4,374 items covering 11 world knowledge domains."""
+
+    name = "ewok"
+    description = (
+        "Elements of World Knowledge (EWoK) is a benchmark for evaluating world modeling by testing their ability to "
+        "use knowledge of a concept to match a target text with a plausible/implausible context."
+    )
+    tags = ["world knowledge"]
+
+    DOMAINS = [
+        "agent_properties",
+        "material_dynamics",
+        "material_properties",
+        "physical_dynamics",
+        "physical_interactions",
+        "physical_relations",
+        "quantitative_properties",
+        "social_interactions",
+        "social_properties",
+        "social_relations",
+        "spatial_relations",
+    ]
+
+    def __init__(self, domain: str = "all"):
+        super().__init__()
+        if domain != "all" and domain not in EWoKScenario.DOMAINS:
+            raise Exception(f"Unknown domain '{domain}', valid domains are {EWoKScenario.DOMAINS}")
+        self.domain = domain.replace("_", "-")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        next_instance_index = 0
+
+        def generate_instance_id() -> str:
+            nonlocal next_instance_index
+            instance_id = f"id{next_instance_index}"
+            next_instance_index += 1
+            return instance_id
+
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # TODO: Switch this to the production dataset when available.
+        dataset = datasets.load_dataset(
+            "ewok-core/ewok-core-1.0",
+            split="test",
+            revision="34d912a608066c92e2990a0328ffc3bd9a716042",
+            cache_dir=cache_dir,
+        )
+
+        instances: List[Instance] = []
+        for row in dataset:
+            contexts = [row["Context1"], row["Context2"]]
+            targets = [row["Target1"], row["Target2"]]
+            # References are category ID, followed by level 2, 3 and 4 category names.
+            for target_index, target in enumerate(targets):
+                input = Input(text=target)
+                references = [
+                    Reference(output=Output(text=context), tags=[CORRECT_TAG] if context_index == target_index else [])
+                    for context_index, context in enumerate(contexts)
+                ]
+                instance = Instance(id=generate_instance_id(), input=input, references=references, split=TEST_SPLIT)
+                # Filtering by domain after generate instance IDs,
+                # so that instance IDs for an item is invariant regardless of domain filtering.
+                if self.domain == "all" or self.domain == row["Domain"]:
+                    instances.append(instance)
+        instances.extend(
+            [
+                Instance(
+                    id=generate_instance_id(),
+                    input=Input(text="I drew a ball from the bag."),
+                    references=[
+                        Reference(output=Output(text="The bag is full of blocks."), tags=[]),
+                        Reference(output=Output(text="The bag is full of balls."), tags=[CORRECT_TAG]),
+                    ],
+                    split=TRAIN_SPLIT,
+                ),
+                Instance(
+                    id=generate_instance_id(),
+                    input=Input(text="The boy chose to eat a cookie."),
+                    references=[
+                        Reference(output=Output(text="The boy likes cookies."), tags=[CORRECT_TAG]),
+                        Reference(output=Output(text="The boy does not like cookies."), tags=[]),
+                    ],
+                    split=TRAIN_SPLIT,
+                ),
+            ]
+        )
+        return instances
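A detail worth noting in the hunk above: generate_instance_id is called for every row before the domain filter decides whether to keep the instance, so an item's ID does not depend on which domain was requested. A sketch of the property this guarantees, assuming access to the gated ewok-core/ewok-core-1.0 dataset:

# Sketch only: IDs are consumed in dataset order even for filtered-out rows,
# so a domain-filtered run yields a subset of the unfiltered run's IDs.
from helm.benchmark.scenarios.ewok_scenario import EWoKScenario

all_ids = {instance.id for instance in EWoKScenario(domain="all").get_instances("output")}
spatial_ids = {instance.id for instance in EWoKScenario(domain="spatial_relations").get_instances("output")}
assert spatial_ids <= all_ids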
helm/benchmark/scenarios/fin_qa_scenario.py
@@ -18,6 +18,8 @@ from helm.benchmark.scenarios.scenario import (
 DATASET_URL_PREFIX = "https://github.com/czyssrs/FinQA/raw/0f16e2867befa6840783e58be38c9efb9229d742/dataset/"
 INSTRUCTIONS = """Presented with a financial report consisting of textual contents and a structured table, given a question, generate the reasoning program in the domain specific langauge (DSL) that will be executed to get the answer.
 
+Respond with only the program in the DSL for the last question, without any preamble, elaboration, or working steps. Do not respond with anything that is not part of the DSL.
+
 The DSL consists of mathematical operations and table operations as executable programs. The program consists of a sequence of operations. Each operation takes a list of arguments.
 
 There are 6 mathematical operations: add, subtract, multiply, divide, greater, exp, and 4 table aggregation operations table-max, table-min, table-sum, table-average, that apply aggregation operations on table rows. The mathematical operations take arguments of either numbers from the given reports, or a numerical result from a previous step.
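To make the instruction concrete: FinQA-style DSL programs are comma-separated sequences of operations, with #n referring to the result of step n. An illustrative program (made-up values, not taken from this diff) might look like this:

# Illustrative FinQA-style DSL program: a difference, then that difference as a ratio.
expected_program = "subtract(5829, 5735), divide(#0, 5735)"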
helm/benchmark/scenarios/financebench_scenario.py
@@ -0,0 +1,53 @@
+import dataclasses
+import json
+import os
+import random
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class FinanceBenchScenario(Scenario):
+    """FinanceBench"""
+
+    name = "financebench"
+    description = "FinanceBench"
+    tags = ["finance"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        target_path = os.path.join(cache_dir, "financebench_open_source.jsonl")
+        url: str = (
+            "https://raw.githubusercontent.com/patronus-ai/financebench/d7beebe5e739e0b806ab4443c1b3e23f51804acf/data/financebench_open_source.jsonl"  # noqa: E501
+        )
+        ensure_file_downloaded(source_url=url, target_path=target_path)
+
+        instances: List[Instance] = []
+        with open(target_path) as f:
+            for line in f:
+                row = json.loads(line)
+                instance_id = row["financebench_id"]
+                question = row["question"]
+                answer = row["answer"]
+                evidence = row["evidence"][0]["evidence_text_full_page"]
+                input_text = f"Evidence: {evidence}\nQuestion: {question}"
+                input = Input(text=input_text)
+                references = [Reference(output=Output(text=answer), tags=[CORRECT_TAG])]
+                instance = Instance(id=instance_id, input=input, references=references, split=TEST_SPLIT)
+                instances.append(instance)
+        random.seed(0)
+        train_indexes = random.sample(list(range(len(instances))), k=10)
+        for train_index in train_indexes:
+            instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
+        return instances
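The train/test assignment above is deterministic: seeding the random module immediately before sampling pins down which 10 instances become in-context training examples on every run. The same pattern in isolation:

import random

# Reseeding right before the draw makes the sampled indexes reproducible.
random.seed(0)
train_indexes = random.sample(range(150), k=10)
print(sorted(train_indexes))  # identical output on every run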
helm/benchmark/scenarios/harm_bench_scenario.py
@@ -0,0 +1,59 @@
+import os
+import pandas as pd
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+class HarmBenchScenario(Scenario):
+    """
+    HarmBench is a standardized evaluation framework for automated red teaming.
+    HarmBench identifies key considerations previously unaccounted for in red teaming
+    evaluations and systematically designed prompts that meet these criteria.
+
+    https://arxiv.org/abs/2402.04249
+    """
+
+    name = "harm_bench"
+    description = "HarmBench dataset - standard and contextual attacks"
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        os.makedirs(data_path, exist_ok=True)
+        url_prefix = "https://raw.githubusercontent.com/centerforaisafety/HarmBench/8e1604d1171fe8a48d8febecd22f600e462bdcdd/data/"  # noqa: E501
+        outf_path = os.path.join(data_path, "harmbench_behaviors_text_all.csv")
+        ensure_file_downloaded(
+            source_url=url_prefix + "behavior_datasets/harmbench_behaviors_text_all.csv",
+            target_path=outf_path,
+            unpack=False,
+        )
+
+        df = pd.read_csv(outf_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+
+        # we take the direct_request template from HarmBench for simplicity
+        # see the example here
+        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
+        #
+        # in the future we can use other types of templates including various red team attacks
+        for i, row in df.iterrows():
+            if row["FunctionalCategory"] == "contextual":
+                context = row["ContextString"]
+                input_text = context + "\n\n---\n\n" + row["Behavior"]
+            else:
+                input_text = row["Behavior"]
+            input = Input(text=input_text)
+            tag = row["Tags"]
+            id = row["BehaviorID"]
+            references = [Reference(output=Output(text=row["SemanticCategory"]), tags=[])]
+            if tag != "":
+                instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+            else:
+                instance = Instance(input=input, split=TEST_SPLIT, references=references, sub_split=tag, id=id)
+            instances.append(instance)
+        return instances
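One caveat when reading the Tags branch above: pandas parses empty CSV cells as NaN (a float) rather than the empty string, so the `tag != ""` comparison is also true for untagged rows. A small self-contained demonstration:

import io

import pandas as pd

# Empty cells come back as NaN, not "" ("some_tag" is a made-up value).
df = pd.read_csv(io.StringIO("Behavior,Tags\nfoo,\nbar,some_tag"))
print(df["Tags"].tolist())       # [nan, 'some_tag']
print(df["Tags"].iloc[0] != "")  # True: NaN is not equal to ""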
helm/benchmark/scenarios/raft_scenario.py
@@ -40,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]
 
 
-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]
 
 
helm/benchmark/scenarios/scenario.py
@@ -25,7 +25,7 @@ DEFAULT_TEST_SIZE: int = 1000
 """ Reference tags """
 CORRECT_TAG: str = "correct"
 
-""" Asset tags (used for compiled outputs such as image2structure)"""
+""" Asset tags (used for compiled outputs such as image2struct)"""
 ASSET_NAME_TAG: str = "asset_name"
 ASSET_PATH_TAG: str = "asset_path"
 
helm/benchmark/scenarios/simple_safety_tests_scenario.py
@@ -0,0 +1,33 @@
+from typing import List
+from datasets import load_dataset
+
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+class SimpleSafetyTestsScenario(Scenario):
+    """
+    The test suite comprises 100 test prompts across five harm areas that LLMs,
+    for the vast majority of applications, should refuse to comply with
+    https://huggingface.co/datasets/Bertievidgen/SimpleSafetyTests
+    """
+
+    name = "SimpleSafetyTests"
+    description = "SimpleSafetyTests dataset"
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # read explicit data
+        dataset = load_dataset(
+            "Bertievidgen/SimpleSafetyTests", split="test", revision="98223c5d8c4059c8f4d8fe2fec8720ee8a20d3c5"
+        )
+
+        # Read all the instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            references = [
+                Reference(output=Output(text=row[column_name]), tags=[]) for column_name in ["category", "harm_area"]
+            ]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
helm/benchmark/scenarios/test_commonsense_scenario.py
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.commonsense_scenario import OpenBookQA
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_openbookqa_scenario():
+    scenario = OpenBookQA()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 5457
+        assert instances[0].input == Input(text="The sun is responsible for")
+        assert instances[0].references == [
+            Reference(output=Output(text="puppies learning new tricks"), tags=[]),
+            Reference(output=Output(text="children growing up and getting old"), tags=[]),
+            Reference(output=Output(text="flowers wilting in a vase"), tags=[]),
+            Reference(output=Output(text="plants sprouting, blooming and wilting"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
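The new scenario tests above and below are all gated behind a @pytest.mark.scenarios marker, so they presumably run only when that marker is selected (and registered in the project's pytest configuration), for example:

pytest -m scenarios helm/benchmark/scenarios/test_commonsense_scenario.py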
helm/benchmark/scenarios/test_ewok_scenario.py
@@ -0,0 +1,25 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from datasets.exceptions import DatasetNotFoundError
+
+from helm.benchmark.scenarios.ewok_scenario import EWoKScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_ewok_scenario():
+    scenario = EWoKScenario()
+    with TemporaryDirectory() as tmpdir:
+        try:
+            instances = scenario.get_instances(tmpdir)
+        except DatasetNotFoundError:
+            pytest.skip("Unable to access gated dataset on Hugging Face Hub; skipping test")
+        assert len(instances) == 8748
+        assert "believes" in instances[0].input.text
+        assert len(instances[0].references) == 2
+        assert "inside" in instances[0].references[0].output.text
+        assert instances[0].references[0].tags == [CORRECT_TAG]
+        assert "outside" in instances[0].references[1].output.text
+        assert instances[0].references[1].tags == []
+        assert instances[0].split == "test"
helm/benchmark/scenarios/test_financebench_scenario.py
@@ -0,0 +1,26 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.financebench_scenario import FinanceBenchScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, TRAIN_SPLIT
+
+
+@pytest.mark.scenarios
+def test_air_2024_scenario_get_instances():
+    scenario = FinanceBenchScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 150
+        assert len([instance for instance in instances if instance.split == TRAIN_SPLIT]) == 10
+        assert (
+            "Evidence: Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\n"  # noqa: E501
+            in instances[0].input.text
+        )
+        assert (
+            "Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement."  # noqa: E501
+            in instances[0].input.text
+        )
+        assert len(instances[0].references) == 1
+        assert instances[0].references[0].output.text == "$1577.00"
+        assert instances[0].references[0].tags == [CORRECT_TAG]
+        assert instances[0].split == TEST_SPLIT
helm/benchmark/scenarios/test_gsm_scenario.py
@@ -0,0 +1,31 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.gsm_scenario import GSM8KScenario
+from helm.benchmark.scenarios.scenario import Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_gsm_scenario_get_instances():
+    math_scenario = GSM8KScenario()
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = math_scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 8792
+        assert actual_instances[0].input == Input(
+            text=(
+                "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many"
+                " clips did Natalia sell altogether in April and May?"
+            )
+        )
+        assert actual_instances[0].references == [
+            Reference(
+                output=Output(
+                    text=(
+                        "Natalia sold 48/2 = <<48/2=24>>24 clips in May. Natalia sold 48+24 = <<48+24=72>>72 clips"
+                        " altogether in April and May. The answer is 72."
+                    )
+                ),
+                tags=["correct"],
+            )
+        ]
+        assert actual_instances[0].split == "train"
helm/benchmark/scenarios/test_legalbench_scenario.py
@@ -0,0 +1,30 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.legalbench_scenario import LegalBenchScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_legalbench_scenario():
+    scenario = LegalBenchScenario(subset="abercrombie")
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 100
+        assert instances[0].input == Input(text='Description: The mark "Ivory" for a product made of elephant tusks.')
+        assert instances[0].references == [
+            Reference(output=Output(text="generic"), tags=["correct"]),
+        ]
+        assert instances[0].split == "train"
+
+    scenario = LegalBenchScenario(subset="proa")
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 100
+        assert instances[0].input == Input(
+            text="Statute: Amendments to pleadings must be filed within 15 days of the filing of the initial pleading."
+        )
+        assert instances[0].references == [
+            Reference(output=Output(text="No"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
helm/benchmark/scenarios/test_math_scenario.py
@@ -5,18 +5,12 @@ from helm.benchmark.scenarios.math_scenario import MATHScenario
 from helm.benchmark.scenarios.scenario import Input, Output, Reference
 
 
-
-@pytest.mark.skip(
-    reason="Incompatible with newer versions with diffusers>0.24.0. Fails with "
-    '"Loading a dataset cached in a LocalFileSystem is not supported"'
-)
+@pytest.mark.scenarios
 def test_math_scenario_get_instances():
     math_scenario = MATHScenario(subject="number_theory", level="1")
     with TemporaryDirectory() as tmpdir:
         actual_instances = math_scenario.get_instances(tmpdir)
         assert len(actual_instances) == 77
         assert actual_instances[0].input == Input(text="What is the remainder when (99)(101) is divided by 9?")
-        assert actual_instances[0].references == [
-            Reference(output=Output(text="0", multimedia_content=None), tags=["correct"])
-        ]
+        assert actual_instances[0].references == [Reference(output=Output(text="0"), tags=["correct"])]
         assert actual_instances[0].split == "train"
helm/benchmark/scenarios/test_med_qa_scenario.py
@@ -0,0 +1,30 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.med_qa_scenario import MedQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_med_qa_scenario():
+    scenario = MedQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 12723
+        assert instances[0].input == Input(
+            text=(
+                "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it"
+                " started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She"
+                " otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C),"
+                " blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on"
+                " room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus."
+                " Which of the following is the best treatment for this patient?"
+            )
+        )
+        assert instances[0].references == [
+            Reference(output=Output(text="Ampicillin"), tags=[]),
+            Reference(output=Output(text="Ceftriaxone"), tags=[]),
+            Reference(output=Output(text="Doxycycline"), tags=[]),
+            Reference(output=Output(text="Nitrofurantoin"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from tempfile import TemporaryDirectory
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.mark.scenarios
|
|
9
|
+
def test_mmlu_scenario():
|
|
10
|
+
with TemporaryDirectory() as tmpdir:
|
|
11
|
+
scenario = MMLUScenario(subject="abstract_algebra")
|
|
12
|
+
instances = scenario.get_instances(tmpdir)
|
|
13
|
+
assert len(instances) == 116
|
|
14
|
+
assert instances[0].input == Input(text="Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.")
|
|
15
|
+
assert instances[0].references == [
|
|
16
|
+
Reference(output=Output(text="0"), tags=[]),
|
|
17
|
+
Reference(output=Output(text="1"), tags=[CORRECT_TAG]),
|
|
18
|
+
Reference(output=Output(text="2"), tags=[]),
|
|
19
|
+
Reference(output=Output(text="3"), tags=[]),
|
|
20
|
+
]
|
|
21
|
+
assert instances[0].split == "train"
|
|
22
|
+
|
|
23
|
+
scenario = MMLUScenario(subject="anatomy")
|
|
24
|
+
instances = scenario.get_instances(tmpdir)
|
|
25
|
+
assert len(instances) == 154
|
|
26
|
+
assert instances[0].input == Input(text="What is the embryological origin of the hyoid bone?")
|
|
27
|
+
assert instances[0].references == [
|
|
28
|
+
Reference(output=Output(text="The first pharyngeal arch"), tags=[]),
|
|
29
|
+
Reference(output=Output(text="The first and second pharyngeal arches"), tags=[]),
|
|
30
|
+
Reference(output=Output(text="The second pharyngeal arch"), tags=[]),
|
|
31
|
+
Reference(output=Output(text="The second and third pharyngeal arches"), tags=[CORRECT_TAG]),
|
|
32
|
+
]
|
|
33
|
+
assert instances[0].split == "train"
|