crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
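Many of the path changes above come from a module rename, image2structure → image2struct, so downstream code that imports these modules by their old dotted paths will break when upgrading from 0.5.1 to 0.5.3. A minimal sketch of the adjustment, using one renamed module from the list; the imported class name is illustrative and not confirmed by this diff:

# crfm-helm 0.5.1
from helm.benchmark.annotation.image2structure.latex_compiler_annotator import LatexCompilerAnnotator

# crfm-helm 0.5.3
from helm.benchmark.annotation.image2struct.latex_compiler_annotator import LatexCompilerAnnotator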
helm/benchmark/scenarios/vision_language/unicorn_scenario.py
CHANGED

@@ -40,7 +40,7 @@ class UnicornScenario(Scenario):
     Paper: https://arxiv.org/abs/2311.16101
     """
 
-
+    UNICORN_HUGGINGFACE_DATASET_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main"
 
     IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
 
@@ -55,8 +55,8 @@ class UnicornScenario(Scenario):
 
     name = "unicorn"
     description = (
-        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
-        "
+        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects "
+        "([Tu et al., 2023](https://arxiv.org/abs/2311.16101))."
     )
     tags = ["vision-language"]
 
@@ -72,12 +72,12 @@ class UnicornScenario(Scenario):
 
         # There is only the test split in Unicorn benchmark
         instances: List[Instance] = []
-        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+        question_data_files = {TEST_SPLIT: f"{self.UNICORN_HUGGINGFACE_DATASET_URL}/{self._subject}.json"}
 
         # Process the test set
         for row in tqdm(
             load_dataset(
-
+                "json",
                 data_files=question_data_files,
                 split=TEST_SPLIT,
                 cache_dir=output_path,
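The Unicorn change above switches load_dataset to the generic "json" builder fed fully qualified file URLs instead of bare file names. A minimal sketch of what the updated call resolves to, assuming HELM's TEST_SPLIT constant is "test"; the subject name below is hypothetical and only for illustration:

from datasets import load_dataset

# Hypothetical subject; the real values come from the scenario's configured subjects.
subject = "OODCV-VQA"
base_url = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main"

dataset = load_dataset(
    "json",                                             # generic JSON builder
    data_files={"test": f"{base_url}/{subject}.json"},  # fully qualified URL per split
    split="test",
)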
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py
ADDED

@@ -0,0 +1,98 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class VibeEvalScenario(Scenario):
+    """
+    Vibe-Eval: A hard evaluation suite for measuring progress of multimodal language models
+
+    We introduce Vibe-Eval: a new open benchmark and framework for evaluating multimodal chat
+    models. Vibe-Eval consists of 269 visual understanding prompts, including 100 of hard
+    difficulty, complete with gold-standard responses authored by experts. Vibe-Eval is
+    open-ended and challenging with dual objectives: (i) vibe checking multimodal chat models
+    for day-to-day tasks and (ii) rigorously testing and probing the capabilities of present
+    frontier models. Notably, our hard set contains >50% questions that all frontier models
+    answer incorrectly. We also discuss trade-offs between human and automatic evaluation,
+    and show that automatic model evaluation using Reka Core roughly correlates to human judgment.
+
+    @article{padlewski2024vibe,
+      title={Vibe-Eval: A hard evaluation suite for measuring progress of multimodal language models},
+      author={Padlewski, Piotr and Bain, Max and Henderson, Matthew and Zhu, Zhongkai
+              and Relan, Nishant and Pham, Hai and Ong, Donovan and Aleksiev, Kaloyan and Ormazabal, Aitor
+              and Phua, Samuel and others},
+      journal={arXiv preprint arXiv:2405.02287},
+      year={2024}
+    }
+
+    Paper: https://arxiv.org/abs/2405.02287
+    """
+
+    VIBE_EVAL_HUGGINGFACE_DATASET_NAME: str = "RekaAI/VibeEval"
+
+    SUBJECTS: List[str] = [
+        "difficulty-hard",
+        "difficulty-normal",
+    ]
+
+    name = "vibe_eval"
+    description = (
+        "Evaluate multimodal models on day-to-day tasks "
+        "([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+        # Process the test set
+        for row in tqdm(
+            load_dataset(
+                self.VIBE_EVAL_HUGGINGFACE_DATASET_NAME,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            if row["category"] != self._subject:
+                continue
+            example_id: str = row["example_id"].replace("/", "-")
+            # Save the image locally
+            local_image_path: str = os.path.join(images_path, f"{example_id}.png")
+            if not os.path.exists(local_image_path):
+                row["image"].convert("RGB").save(local_image_path, "PNG", optimize=True)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/png"),
+                MediaObject(text=row["prompt"], content_type="text/plain"),
+            ]
+            answer: str = row["reference"]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
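VibeEvalScenario is presumably registered through the expanded vlm_run_specs.py listed above; that run spec is not reproduced in this section. A minimal sketch of how a ScenarioSpec could point at the new class, with an args value taken from its SUBJECTS list:

from helm.benchmark.scenarios.scenario import ScenarioSpec

# Sketch only: directs the benchmark machinery at the new scenario class.
scenario_spec = ScenarioSpec(
    class_name="helm.benchmark.scenarios.vision_language.vibe_eval_scenario.VibeEvalScenario",
    args={"subject": "difficulty-hard"},  # must be one of VibeEvalScenario.SUBJECTS
)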
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py
CHANGED

@@ -53,7 +53,7 @@ class VizWizScenario(Scenario):
     name = "viz_wiz"
     description = (
         "Real-world VQA dataset consisting of questions asked by "
-        "people who are blind ([
+        "people who are blind ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218))."
     )
     tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/vision_language/vqa_scenario.py
CHANGED

@@ -55,7 +55,9 @@ class VQAScenario(Scenario):
     }
 
     name = "vqa"
-    description =
+    description = (
+        "Open-ended questions about real-world images " "([Goyal et al., 2017](https://arxiv.org/abs/1612.00837))."
+    )
     tags = ["vision-language", "visual question answering"]
 
     def get_instances(self, output_path: str) -> List[Instance]:
helm/benchmark/scenarios/xstest_scenario.py
ADDED

@@ -0,0 +1,35 @@
+from typing import List
+from datasets import load_dataset
+
+from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
+
+
+class XSTestScenario(Scenario):
+    """
+    XSTest, a simple standardized evaluation for safety refusals.
+    """
+
+    name = "XSTest"
+    description = "XSTest dataset - safety benchmark"
+    tags = ["safety"]
+
+    def __init__(self, subject: str = "all"):
+        super().__init__()
+        self.subset = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # read explicit data
+        dataset = load_dataset("walledai/XSTest", split="test", revision="f1d713187c61b6ae64e602d74f0b3d812cc2e8e8")
+
+        # Read all the instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["prompt"])
+            references = []
+            for column_name in ["focus", "type", "note"]:
+                if row[column_name]:
+                    references += [Reference(output=Output(text=row[column_name]), tags=[])]
+            references += [Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG])]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
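For a quick sanity check outside the full benchmark pipeline, the new scenario can be exercised directly. A minimal sketch, assuming the walledai/XSTest dataset is reachable and the output path (an arbitrary example here) is writable:

from helm.benchmark.scenarios.xstest_scenario import XSTestScenario

# Sketch only: build the instances and inspect the first prompt.
scenario = XSTestScenario()
instances = scenario.get_instances(output_path="benchmark_output/scenarios/xstest")
print(len(instances))
print(instances[0].input.text)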
helm/benchmark/server.py
CHANGED
@@ -113,11 +113,6 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
-    parser.add_argument(
-        "--jquery",
-        action="store_true",
-        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
-    )
     args = parser.parse_args()
 
     if args.suite and args.release:
@@ -126,7 +121,7 @@ def main():
     # Determine the location of the static directory.
     # This is a hack: it assumes that the static directory has a physical location,
     # which is not always the case (e.g. when using zipimport).
-    static_package_name = "helm.benchmark.
+    static_package_name = "helm.benchmark.static_build"
    resource_path = resources.files(static_package_name).joinpath("index.html")
    with resources.as_file(resource_path) as resource_filename:
        static_path = str(resource_filename.parent)