crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py
RENAMED
    @@ -1,24 +1,26 @@
    -from typing import Dict, List, Any
    +from typing import Dict, List, Any, Optional
     
    +from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
     from helm.benchmark.scenarios.scenario import VALID_SPLIT
    -from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
    +from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
         Image2StructureScenario,
         PROCESSED,
         DIFFICULTY_ALL,
     )
    -from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
    -from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
    +from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
    +from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
         save_random_screenshot,
         ScreenshotOptions,
     )
    -from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
    +from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
     from helm.common.general import ensure_directory_exists
     from helm.common.optional_dependencies import handle_module_not_found_error
    +from helm.common.hierarchical_logger import hlog
     
     try:
         from html2text import HTML2Text
     except ModuleNotFoundError as e:
    -    handle_module_not_found_error(e, suggestions=["
    +    handle_module_not_found_error(e, suggestions=["image2struct"])
     
     
     import base64
    @@ -73,28 +75,48 @@ def serve_and_take_screenshot(
         if not success:
             # This runs on examples that are not expected to fail
             server.stop()
    +        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
             raise ValueError(f"Jekyll server failed to start: {repo_path}")
     
         # Take a screenshot of a random page
         success = False
    -    error: Exception
    +    error: Optional[Exception] = None
    +
    +    MAX_TRIES_ALL_ERRORS = 3
    +    MAX_TRIES_CONNECTION_REFUSED = 5
    +    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
    +    for compilation_attempt in range(MAX_TRIES):
             try:
                 infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
                 success = True
                 break
             except Exception as e:
    +            error = e
    +
    +            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
    +                hlog(
    +                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
    +                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
    +                )
                     server.stop()
                     time.sleep(0.5)
                     server.start()
                     time.sleep(0.5)
    +            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
    +                hlog(
    +                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
    +                    f" Error: {e}. Retrying..."
    +                )
                 else:
                     # Do not retry
    +                hlog(
    +                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
    +                    f" Error: {e}. Raising CompilationError."
    +                )
                     break
    +
         if not success:
    -        raise
    +        raise CompilationError(f"Failed to take a screenshot: {error}")
     
         # Stop the server
         server.stop()
    @@ -129,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
         )
     
         HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
    -    SUBSETS = ["css", "html", "javascript", "
    +    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
         MAX_TRIES: int = 5
         ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
     
    @@ -167,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
                 shutil.rmtree(assets_save_path)
             ensure_directory_exists(assets_save_path)
     
    +        if "wild" in self._subset:
    +            # There is no stucture
    +            del row["assets"]
    +            row["assets_paths"] = []
    +            row["assets_names"] = []
    +            return row
    +
             # Structure is a base64 encoding of the repo
             if self._output_path is None:
                 raise ValueError("Output path not set")
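The rewritten screenshot loop above bounds retries per error class: `net::ERR_CONNECTION_REFUSED` (typically a Jekyll server that is still coming back up) gets up to `MAX_TRIES_CONNECTION_REFUSED` attempts, any other error up to `MAX_TRIES_ALL_ERRORS`, and a `CompilationError` is raised once retries are exhausted. A minimal standalone sketch of the same pattern, with hypothetical `take_screenshot` and `restart_server` callables standing in for the HELM internals and a plain `RuntimeError` in place of `CompilationError`:

```python
from typing import Callable, Optional

MAX_TRIES_ALL_ERRORS = 3
MAX_TRIES_CONNECTION_REFUSED = 5
MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)


def screenshot_with_retries(take_screenshot: Callable[[], None], restart_server: Callable[[], None]) -> None:
    """Retry a flaky screenshot call, giving connection-refused errors extra attempts."""
    error: Optional[Exception] = None
    for attempt in range(MAX_TRIES):
        try:
            take_screenshot()
            return  # success, nothing left to do
        except Exception as e:
            error = e
            if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < MAX_TRIES_CONNECTION_REFUSED:
                restart_server()  # the server is likely not up yet; bounce it and retry
            elif attempt < MAX_TRIES_ALL_ERRORS:
                continue  # unknown error, retry a limited number of times
            else:
                break  # retries exhausted for this error class
    raise RuntimeError(f"Failed to take a screenshot: {error}")
```

In the actual scenario the retry wraps `save_random_screenshot` and stops and restarts the `JekyllServer` between connection-refused attempts.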
helm/benchmark/scenarios/vision_language/math_vista_scenario.py
CHANGED
    @@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
         name = "math_vista"
         description = (
             "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
    -        "([
    +        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
         )
         tags = ["vision-language", "reasoning", "math"]
helm/benchmark/scenarios/vision_language/mementos_scenario.py
CHANGED
    @@ -38,10 +38,10 @@ class MementosScenario(Scenario):
         Paper: https://arxiv.org/abs/2401.10529
         """
     
    -    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "
    +    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
     
         IMAGE_URL: str = (
    -        "https://huggingface.co/datasets/
    +        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
             + "{subject}/{split}/{file_name}?download=true"
         )
     
    @@ -56,7 +56,7 @@ class MementosScenario(Scenario):
         name = "mementos"
         description = (
             "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
    -        " ([
    +        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
         )
         tags = ["vision-language"]
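The scenario now builds per-image download URLs from the template above. A quick sketch of the expansion (the subject, split, and file name below are made-up values, used only to illustrate how the template is filled in):

```python
IMAGE_URL = (
    "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
    + "{subject}/{split}/{file_name}?download=true"
)

# Hypothetical values, purely to show the expansion.
print(IMAGE_URL.format(subject="dailylife", split="val", file_name="example_0.png"))
```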
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
CHANGED
    @@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
         }
     
         QUESTIONS_URL_TEMPLATE: str = (
    -        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/
    +        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
         )
         IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
     
         name = "mm_safety_bench"
         description = (
             "Expose the vulnerability of open-source VLMs with toxic and biased content "
    -        "([
    +        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
         )
         tags = ["vision-language", "bias", "toxicity"]
helm/benchmark/scenarios/vision_language/mme_scenario.py
CHANGED
    @@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
     
     class MMEScenario(Scenario):
         """
    +    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
    +
    +    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
    +    multimodal tasks, showing amazing emergent abilities in recent studies. However,
    +    it is difficult for these case studies to fully reflect the performance of MLLM,
    +    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
    +    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
    +    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
    +    that may arise from direct use of public datasets for evaluation, the annotations
    +    of instruction-answer pairs are all manually designed. The concise instruction design
    +    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
    +    Besides, with such an instruction, we can also easily carry out quantitative
    +    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
    +    We use the multiple-choice metrics for 14 different evaluation tasks.
    +
    +    @article{fu2023mme,
         title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
         author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
         Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
    @@ -43,7 +43,7 @@ class MMEScenario(Scenario):
         year={2023}
         }
     
    +    Paper: https://arxiv.org/abs/2306.13394
         """
     
         MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
    @@ -66,7 +66,10 @@
         ]
     
         name = "mme"
    -    description =
    +    description = (
    +        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
    +        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
    +    )
         tags = ["vision-language"]
         options: List[str] = ["Yes", "No"]
helm/benchmark/scenarios/vision_language/mmmu_scenario.py
CHANGED
    @@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
         name = "mmmu"
         description = (
             "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
    -        "subject knowledge and deliberate reasoning ([
    +        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
         )
         tags = ["vision-language"]
helm/benchmark/scenarios/vision_language/pairs_scenario.py
CHANGED
    @@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
         name = "pairs"
         description = (
             "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
    -        "([
    +        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
         )
         tags = ["vision-language", "bias"]
helm/benchmark/scenarios/vision_language/pope_scenario.py
CHANGED
    @@ -42,7 +42,8 @@ class POPEScenario(Scenario):
     
         name = "pope"
         description = (
    -        "Open-ended questions about hallucination images
    +        "Open-ended questions about hallucination images "
    +        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
         )
         tags = ["vision-language", "visual question answering"]
         options: List[str] = ["Yes", "No"]
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py
ADDED
    @@ -0,0 +1,57 @@
    +from typing import List
    +import os
    +
    +from datasets import load_dataset
    +from tqdm import tqdm
    +
    +from helm.benchmark.scenarios.scenario import (
    +    CORRECT_TAG,
    +    TEST_SPLIT,
    +    Instance,
    +    Input,
    +    Output,
    +    Reference,
    +    Scenario,
    +)
    +from helm.common.media_object import MediaObject, MultimediaObject
    +from helm.common.images_utils import generate_hash
    +
    +
    +class RealWorldQAScenario(Scenario):
    +    """
    +    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
    +    images taken from vehicles, in addition to other real-world images.
    +
    +    Blog post: https://x.ai/blog/grok-1.5v
    +    Website: https://huggingface.co/datasets/xai-org/RealworldQA
    +    """
    +
    +    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
    +
    +    name = "real_world_qa"
    +    description = (
    +        "A benchmark designed to to evaluate real-world spatial understanding capabilities of multimodal models "
    +        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
    +    )
    +    tags = ["vision-language", "knowledge", "reasoning"]
    +
    +    def get_instances(self, output_path: str) -> List[Instance]:
    +        instances: List[Instance] = []
    +        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
    +            # Save the image to disk
    +            image = row["image"]
    +            image_file_name: str = generate_hash(image) + ".jpg"
    +            local_image_path: str = os.path.join(output_path, image_file_name)
    +            if not os.path.exists(local_image_path):
    +                image.save(local_image_path)
    +
    +            content: List[MediaObject] = [
    +                MediaObject(location=local_image_path, content_type="image/jpeg"),
    +                MediaObject(text=row["question"], content_type="text/plain"),
    +            ]
    +            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
    +            instances.append(
    +                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
    +            )
    +
    +        return instances
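The new scenario pulls `xai-org/RealworldQA` straight from the Hugging Face Hub and consumes its `image`, `question`, and `answer` columns. A short sketch for inspecting the same records outside HELM (assumes the `datasets` and `Pillow` packages are installed and the dataset is publicly reachable):

```python
from datasets import load_dataset

# The scenario iterates the "test" split (HELM's TEST_SPLIT).
dataset = load_dataset("xai-org/RealworldQA", split="test")

row = dataset[0]
print(row["question"])
print(row["answer"])
row["image"].save("example.jpg")  # a PIL image, the same object the scenario writes to disk
```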
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
CHANGED
    @@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
         the multiple-choice metric for evaluating the performance of models.
     
         @article{li2023seed,
    +    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
    +    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
    +    journal={arXiv preprint arXiv:2307.16125},
    +    year={2023}
         }
     
         Paper: https://arxiv.org/abs/2307.16125
    @@ -59,7 +59,9 @@
         }
     
         name = "seed_bench"
    -    description =
    +    description = (
    +        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
    +    )
         tags = ["vision-language"]
     
         def __init__(self, subject: str):
helm/benchmark/scenarios/vision_language/unicorn_scenario.py
CHANGED
    @@ -55,8 +55,8 @@ class UnicornScenario(Scenario):
     
         name = "unicorn"
         description = (
    -        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
    -        "
    +        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects "
    +        "([Tu et al., 2023](https://arxiv.org/abs/2311.16101))."
         )
         tags = ["vision-language"]
helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py
CHANGED
    @@ -39,7 +39,7 @@ class VibeEvalScenario(Scenario):
         year={2024}
         }
     
    -    Paper: https://arxiv.org/abs/
    +    Paper: https://arxiv.org/abs/2405.02287
         """
     
         VIBE_EVAL_HUGGINGFACE_DATASET_NAME: str = "RekaAI/VibeEval"
    @@ -50,8 +50,11 @@
         ]
     
         name = "vibe_eval"
    -    description =
    +    description = (
    +        "Evaluate multimodal models on day-to-day tasks "
    +        "([Padlewski et al., 2024](https://arxiv.org/abs/2405.02287))."
    +    )
    +    tags = ["vision-language", "knowledge", "reasoning"]
     
         def __init__(self, subject: str):
             super().__init__()
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py
CHANGED
    @@ -53,7 +53,7 @@ class VizWizScenario(Scenario):
         name = "viz_wiz"
         description = (
             "Real-world VQA dataset consisting of questions asked by "
    -        "people who are blind ([
    +        "people who are blind ([Gurari et al., 2018](https://arxiv.org/abs/1802.08218))."
         )
         tags = ["vision-language", "visual question answering"]
helm/benchmark/scenarios/vision_language/vqa_scenario.py
CHANGED
    @@ -55,7 +55,9 @@ class VQAScenario(Scenario):
         }
     
         name = "vqa"
    -    description =
    +    description = (
    +        "Open-ended questions about real-world images " "([Goyal et al., 2017](https://arxiv.org/abs/1612.00837))."
    +    )
         tags = ["vision-language", "visual question answering"]
     
         def get_instances(self, output_path: str) -> List[Instance]:
helm/benchmark/scenarios/xstest_scenario.py
ADDED
    @@ -0,0 +1,35 @@
    +from typing import List
    +from datasets import load_dataset
    +
    +from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
    +
    +
    +class XSTestScenario(Scenario):
    +    """
    +    XSTest, a simple standardized evaluation for safety refusals.
    +    """
    +
    +    name = "XSTest"
    +    description = "XSTest dataset - safety benchmark"
    +    tags = ["safety"]
    +
    +    def __init__(self, subject: str = "all"):
    +        super().__init__()
    +        self.subset = subject
    +
    +    def get_instances(self, output_path: str) -> List[Instance]:
    +        # read explicit data
    +        dataset = load_dataset("walledai/XSTest", split="test", revision="f1d713187c61b6ae64e602d74f0b3d812cc2e8e8")
    +
    +        # Read all the instances
    +        instances: List[Instance] = []
    +        for row in dataset:
    +            input = Input(text=row["prompt"])
    +            references = []
    +            for column_name in ["focus", "type", "note"]:
    +                if row[column_name]:
    +                    references += [Reference(output=Output(text=row[column_name]), tags=[])]
    +            references += [Reference(output=Output(text=row["label"]), tags=[CORRECT_TAG])]
    +            instance = Instance(input=input, references=references, split=TEST_SPLIT)
    +            instances.append(instance)
    +        return instances
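`XSTestScenario` pins a fixed revision of `walledai/XSTest` and keeps the `label` column as the correct reference, with `focus`, `type`, and `note` attached as additional references. A sketch of inspecting that data directly (assumes the `datasets` package and network access):

```python
from collections import Counter

from datasets import load_dataset

dataset = load_dataset(
    "walledai/XSTest", split="test", revision="f1d713187c61b6ae64e602d74f0b3d812cc2e8e8"
)

print(dataset[0]["prompt"])
# Distribution of reference labels across the test prompts.
print(Counter(row["label"] for row in dataset))
```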
helm/benchmark/server.py
CHANGED
    @@ -113,11 +113,6 @@ def main():
             default=None,
             help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
         )
    -    parser.add_argument(
    -        "--jquery",
    -        action="store_true",
    -        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
    -    )
         args = parser.parse_args()
     
         if args.suite and args.release:
    @@ -126,7 +121,7 @@ def main():
         # Determine the location of the static directory.
         # This is a hack: it assumes that the static directory has a physical location,
         # which is not always the case (e.g. when using zipimport).
    -    static_package_name = "helm.benchmark.
    +    static_package_name = "helm.benchmark.static_build"
         resource_path = resources.files(static_package_name).joinpath("index.html")
         with resources.as_file(resource_path) as resource_filename:
             static_path = str(resource_filename.parent)
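With the `--jquery` flag gone, `server.py` always resolves its static assets from the `helm.benchmark.static_build` package using the standard `importlib.resources` idiom. A minimal sketch of that idiom against a hypothetical package name:

```python
from importlib import resources

# Hypothetical package; helm.benchmark.server uses "helm.benchmark.static_build".
static_package_name = "mypackage.static_build"

resource_path = resources.files(static_package_name).joinpath("index.html")
with resources.as_file(resource_path) as resource_filename:
    # as_file() materializes the resource on disk if the package is not installed
    # as a plain directory (e.g. zipimport), so .parent is a usable filesystem path.
    static_path = str(resource_filename.parent)
    print(static_path)
```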