crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py}
RENAMED
@@ -22,6 +22,10 @@ from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 
 PROCESSED: str = "processed"
+DIFFICULTY_ALL = "all"
+DIFFICULTY_EASY = "easy"
+DIFFICULTY_MEDIUM = "medium"
+DIFFICULTY_HARD = "hard"
 
 
 class Image2StructureScenario(Scenario):
@@ -38,13 +42,16 @@ class Image2StructureScenario(Scenario):
         VALID_SPLIT: "validation",
     }
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+    def __init__(
+        self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT, difficulty: str = DIFFICULTY_ALL
+    ):
         super().__init__()
         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
         self._subset: str = subset
         self._recompile_prompt: bool = recompile_prompt
         self._split: str = split
         self._output_path: Optional[str] = None
+        self._difficulty: str = difficulty
 
     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
         # By default, there are no assets
@@ -110,6 +117,10 @@ class Image2StructureScenario(Scenario):
                 )
                 continue
 
+            # Filter by difficulty
+            if self._difficulty != DIFFICULTY_ALL and row["difficulty"] != self._difficulty:
+                continue
+
             # Step 1: Preprocess the row
             row = self.preprocess_row(row, assets_path)
 
@@ -158,7 +169,7 @@ class Image2StructureScenario(Scenario):
             # representing the structure (such as LaTeX code)
             multimedia_object = MultimediaObject([image_object])
             reference = Reference(
-                output=Output(text=row["text"], multimedia_content=multimedia_object),
+                output=Output(text=row["text"] if "text" in row else "", multimedia_content=multimedia_object),
                 tags=[CORRECT_TAG],
             )
         else:
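Taken together, the image2struct_scenario.py changes introduce a difficulty knob: every Image2StructureScenario subclass now accepts a difficulty argument, and dataset rows whose "difficulty" field does not match are skipped during instance generation. A minimal usage sketch (constructor arguments as shown in the diff; treat the exact call as an assumption, since subclasses may add parameters of their own):

from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import DIFFICULTY_HARD
from helm.benchmark.scenarios.vision_language.image2struct.latex_scenario import LatexScenario

# Only rows with row["difficulty"] == "hard" survive the new filter;
# the default DIFFICULTY_ALL ("all") preserves the pre-0.5.3 behavior.
scenario = LatexScenario(subset="equation", difficulty=DIFFICULTY_HARD)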
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py
RENAMED
@@ -1,22 +1,18 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
+from helm.benchmark.scenarios.vision_language.image2struct.utils_latex import (
     latex_to_image,
     strip_unnecessary_latex_parts,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class LatexScenario(Image2StructureScenario):
     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
-    SUBSETS = ["equation", "table", "plot", "algorithm"]
+    SUBSETS = ["equation", "table", "plot", "algorithm", "wild", "wild_legacy"]
 
     name = "image2latex"
     description = "Evaluate multimodal models on Latex generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
         image.save(destination_path)
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py
RENAMED
@@ -1,5 +1,4 @@
-from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import Image2StructureScenario
 
 
 class MusicSheetScenario(Image2StructureScenario):
@@ -13,8 +12,5 @@ class MusicSheetScenario(Image2StructureScenario):
     name = "image2musicsheet"
     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
 
-    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
-        super().__init__(subset, recompile_prompt, split)
-
     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
         raise Exception("Music sheets have no ground truth, compilation is not possible")
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py
RENAMED
@@ -5,6 +5,7 @@ import os
 import re
 
 from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
+from helm.common.hierarchical_logger import hlog
 
 try:
     from latex import build_pdf
@@ -12,14 +13,13 @@ try:
     from PIL import ImageOps
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 # LaTeX preamble
 # Make sure to install "latex-full".
 TEX_INCLUDES = r"""
 \usepackage{amsmath,amssymb,amsfonts}
 \usepackage{graphicx}
-\usepackage{graphicx}
 \usepackage{amsmath}
 \usepackage{xcolor}
 \usepackage{algorithm}
@@ -98,23 +98,19 @@ def pdf_to_image(
 
 def strip_unnecessary_latex_parts(latex_code: str) -> str:
     """Strip unnecessary parts of the LaTeX code."""
-
     # Remove comments
     minimal_latex_code = re.sub(r"%.*?\n", "\n", latex_code)
-
     # Remove \documentclass and any \usepackage lines
-    minimal_latex_code = re.sub(r"\\documentclass\{.*?\}", "", latex_code)
-    minimal_latex_code = re.sub(r"\\usepackage\{.*?\}", "", minimal_latex_code)
-
+    minimal_latex_code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", latex_code)
+    minimal_latex_code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", minimal_latex_code)
+    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", minimal_latex_code)
     # Remove everything before \begin{document} and including it, and everything after \end{document}
     minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
     minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)
-
     # Ensure \begin{...} is followed by a \n
     minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
     # Ensure \end{...} has a \n before it
     minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)
-
     # Normalize space sequences to a single space globally
     minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
     # Replace tabs with a single space
@@ -123,7 +119,6 @@ def strip_unnecessary_latex_parts(latex_code: str) -> str:
     minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
     # Remove unnecessary whitespace - multiple empty lines and tabulations
     minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)
-
     return minimal_latex_code.strip()
 
 
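To see what the tightened regexes in strip_unnecessary_latex_parts do, here is a condensed re-run of the same substitutions on a toy document (a standalone sketch for illustration; the package function additionally normalizes whitespace afterwards):

import re

doc = r"""\documentclass[12pt]{article}
\usepackage{amsmath}
\begin{document}
% a comment
E = mc^2
\end{document}"""

code = re.sub(r"%.*?\n", "\n", doc)  # strip comments
code = re.sub(r"\\documentclass(\[.*?\])?\{.*?\}", "", code)  # now tolerates [options]
code = re.sub(r"\\documentstyle(\[.*?\])?\{.*?\}", "", code)  # newly handles legacy \documentstyle
code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}", "", code)
code = re.sub(r"\\begin\{document\}\n*", "", code, flags=re.DOTALL)
code = re.sub(r"\\end\{document\}.*", "", code, flags=re.DOTALL)
print(code.strip())  # -> E = mc^2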
@@ -226,25 +221,21 @@ def handle_latex_error(
     # Error format: "LaTeX Error: Environment <env> undefined."
     undefined_search = re.search(r"LaTeX Error: Environment (.*) undefined", str_e)
     if undefined_search:
-        # …
-        if …
-        # …
-        # does not exist, so we raise an error
-        raise RuntimeError(str(e)) from e
-
-        fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
+        # Here we try to manually solve the missing environment.
+        # This is either executed on the second retry, or on the first if no changes
+        # were made in the first retry.
+        assert TEX_INCLUDES in fixed_code, f"TEX_INCLUDES should be present in the code. code={fixed_code}"
+        # TEX_INCLUDES is already present, so we add the missing package
+        # Since we cannot know the name of the package that contains the missing environment,
+        # we simply hope that they are named the same way.
+        env_undefined: str = undefined_search.group(1)
+
+        if f"\\usepackage{{{env_undefined}}}" in fixed_code:
+            # We already tried to include the missing package, but it probably
+            # does not exist, so we raise an error
+            raise RuntimeError(str(e)) from e
+
+        fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")
 
     # Try again with the fixed code (if the fixed code is different from the original code)
     if fixed_code != original_latex_code:
@@ -313,23 +304,24 @@ def latex_to_image(
 
     # 2. Add preamble
     # 2.1. Remove \documentclass if present to make sure we use our own
-    documentclass_search = re.search(r"\\documentclass\{.*?\}", original_latex_code)
+    documentclass_search = re.search(r"\\documentclass(\[.*?\])?\{.*?\}", original_latex_code)
+    documentstyle_search = re.search(r"\\documentstyle(\[.*?\])?\{.*?\}", original_latex_code)
     if documentclass_search:
-        …
-        original_latex_code = original_latex_code.replace(…
+        matching_string = documentclass_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
+    elif documentstyle_search:
+        matching_string = documentstyle_search.group()
+        original_latex_code = original_latex_code.replace(matching_string, TEX_BEGIN_FILE)
     else:
         # If there is no \documentclass, we add our own
         original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code
 
-    # 2.2. Add includes. In this …
-    # If there are missing packages, in handle_latex_error, we will add TEX_INCLUDES after the begin document,
-    # which might define some packages twice, but often solves the problem.
-    if not re.search(r"\\usepackage\{.*\}", original_latex_code):
-        original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
+    # 2.2. Add includes. In this step we remove all includes in favor of the default ones.
+    original_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*\}", "", original_latex_code)
+    original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
 
     latex_code: str = original_latex_code
+    hlog(f"Compiling LaTeX code:\n{latex_code}")
     try:
         pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
         image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py
RENAMED
@@ -6,7 +6,7 @@ try:
     from selenium import webdriver
     import selenium.common.exceptions
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py
RENAMED
@@ -5,7 +5,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def convert_html_to_text(handler: HTML2Text, html: str) -> str:
helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py
RENAMED
@@ -1,23 +1,26 @@
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional
 
+from helm.benchmark.annotation.image2struct.image_compiler_annotator import CompilationError
 from helm.benchmark.scenarios.scenario import VALID_SPLIT
-from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+from helm.benchmark.scenarios.vision_language.image2struct.image2struct_scenario import (
     Image2StructureScenario,
     PROCESSED,
+    DIFFICULTY_ALL,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import (
     save_random_screenshot,
     ScreenshotOptions,
 )
-from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
 from helm.common.general import ensure_directory_exists
 from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.hierarchical_logger import hlog
 
 try:
     from html2text import HTML2Text
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["image2structure"])
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 import base64
@@ -72,28 +75,48 @@ def serve_and_take_screenshot(
     if not success:
         # This runs on examples that are not expected to fail
         server.stop()
+        hlog(f"Failed to start the Jekyll server: {repo_path} on port {port}. Will raise a ValueError.")
         raise ValueError(f"Jekyll server failed to start: {repo_path}")
 
     # Take a screenshot of a random page
     success = False
-    error: Exception
-
+    error: Optional[Exception] = None
+
+    MAX_TRIES_ALL_ERRORS = 3
+    MAX_TRIES_CONNECTION_REFUSED = 5
+    MAX_TRIES = max(MAX_TRIES_ALL_ERRORS, MAX_TRIES_CONNECTION_REFUSED)
+    for compilation_attempt in range(MAX_TRIES):
         try:
             infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
             success = True
             break
         except Exception as e:
-            …
-            …
+            error = e
+
+            if "net::ERR_CONNECTION_REFUSED" in str(e) and compilation_attempt < MAX_TRIES_CONNECTION_REFUSED:
+                hlog(
+                    f"Failed to take a screenshot: ERR_CONNECTION_REFUSED [Attempt {compilation_attempt + 1}/"
+                    f"{MAX_TRIES_CONNECTION_REFUSED}]. Error: {e}. Retrying..."
+                )
                server.stop()
                time.sleep(0.5)
                server.start()
                time.sleep(0.5)
+            elif compilation_attempt < MAX_TRIES_ALL_ERRORS:
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Retrying..."
+                )
            else:
                # Do not retry
+                hlog(
+                    f"Failed to take a screenshot: Unknown [Attempt {compilation_attempt + 1}/{MAX_TRIES_ALL_ERRORS}]."
+                    f" Error: {e}. Raising CompilationError."
+                )
                break
+
     if not success:
-        raise …
+        raise CompilationError(f"Failed to take a screenshot: {error}")
 
     # Stop the server
     server.stop()
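The new loop retries screenshots with two separate budgets: connection-refused failures get up to 5 attempts with a Jekyll restart in between, while any other error is retried up to 3 times before the loop gives up. The same control flow in isolation (a sketch with hypothetical helper names, not the package's API):

def capture_with_retries(action, restart_server, max_all_errors=3, max_refused=5):
    last_error = None
    for attempt in range(max(max_all_errors, max_refused)):
        try:
            return action()
        except Exception as e:
            last_error = e
            if "net::ERR_CONNECTION_REFUSED" in str(e) and attempt < max_refused:
                restart_server()  # bounce the server, then retry
            elif attempt < max_all_errors:
                continue  # plain retry for other errors
            else:
                break  # retry budget exhausted
    raise RuntimeError(f"Failed to take a screenshot: {last_error}")

In the diff itself, the terminal failure is raised as a CompilationError (imported from the annotator module) rather than a bare exception.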
@@ -128,7 +151,7 @@ class WebpageScenario(Image2StructureScenario):
     )
 
     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
-    SUBSETS = ["css", "html", "javascript"]
+    SUBSETS = ["css", "html", "javascript", "wild", "wild_legacy"]
     MAX_TRIES: int = 5
     ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
 
@@ -140,9 +163,10 @@ class WebpageScenario(Image2StructureScenario):
         subset: str,
         recompile_prompt: bool = True,
         split: str = VALID_SPLIT,
+        difficulty: str = DIFFICULTY_ALL,
         screenshot_options: ScreenshotOptions = ScreenshotOptions(),
     ):
-        super().__init__(subset, recompile_prompt, split)
+        super().__init__(subset, recompile_prompt, split, difficulty)
         self._screenshot_options = screenshot_options
         self._html2text = HTML2Text()
         self._html2text.ignore_links = True
@@ -165,6 +189,13 @@ class WebpageScenario(Image2StructureScenario):
             shutil.rmtree(assets_save_path)
         ensure_directory_exists(assets_save_path)
 
+        if "wild" in self._subset:
+            # There is no structure
+            del row["assets"]
+            row["assets_paths"] = []
+            row["assets_names"] = []
+            return row
+
         # Structure is a base64 encoding of the repo
         if self._output_path is None:
             raise ValueError("Output path not set")
helm/benchmark/scenarios/vision_language/math_vista_scenario.py
@@ -51,7 +51,7 @@ class MathVistaScenario(Scenario):
     name = "math_vista"
     description = (
         "A benchmark designed to combine challenges from diverse mathematical and visual tasks. "
-        "([…
+        "([Lu et al., 2024](https://arxiv.org/abs/2310.02255))."
     )
     tags = ["vision-language", "reasoning", "math"]
 
helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -38,10 +38,10 @@ class MementosScenario(Scenario):
     Paper: https://arxiv.org/abs/2401.10529
     """
 
-    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "…
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "RussWang96/unofficial_mementos_dataset"
 
     IMAGE_URL: str = (
-        "https://huggingface.co/datasets/…
+        "https://huggingface.co/datasets/RussWang96/unofficial_mementos_dataset/resolve/main/"
         + "{subject}/{split}/{file_name}?download=true"
     )
 
@@ -56,7 +56,7 @@ class MementosScenario(Scenario):
     name = "mementos"
     description = (
         "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
-        " ([…
+        " ([Wang et al., 2024](https://arxiv.org/abs/2401.10529))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py
@@ -48,14 +48,14 @@ class MMSafetyBenchScenario(Scenario):
     }
 
     QUESTIONS_URL_TEMPLATE: str = (
-        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/…
+        "https://raw.githubusercontent.com/isXinLiu/MM-SafetyBench/main/data/processed_questions/{dataset}.json"
     )
     IMAGES_URL: str = "https://drive.google.com/uc?export=download&id=1xjW9k-aGkmwycqGCXbru70FaSKhSDcR_"
 
     name = "mm_safety_bench"
     description = (
         "Expose the vulnerability of open-source VLMs with toxic and biased content "
-        "([…
+        "([Liu et al., 2023](https://arxiv.org/abs/2311.17600))."
     )
     tags = ["vision-language", "bias", "toxicity"]
 
helm/benchmark/scenarios/vision_language/mme_scenario.py
@@ -19,22 +19,22 @@ from helm.common.general import ensure_directory_exists
 
 class MMEScenario(Scenario):
     """
-    …
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
     title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
     author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
     Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
@@ -43,7 +43,7 @@ class MMEScenario(Scenario):
     year={2023}
     }
 
-    …
+    Paper: https://arxiv.org/abs/2306.13394
     """
 
     MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
@@ -66,7 +66,10 @@ class MMEScenario(Scenario):
     ]
 
     name = "mme"
-    description = …
+    description = (
+        "Evaluate multimodal models on their perception and cognition abilities on a total of 14 subtasks "
+        "([Fu et al., 2023](https://arxiv.org/abs/2306.13394))."
+    )
     tags = ["vision-language"]
     options: List[str] = ["Yes", "No"]
 
helm/benchmark/scenarios/vision_language/mmmu_scenario.py
@@ -81,7 +81,7 @@ class MMMUScenario(Scenario):
     name = "mmmu"
     description = (
         "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
-        "subject knowledge and deliberate reasoning ([…
+        "subject knowledge and deliberate reasoning ([Yue et al., 2023](https://arxiv.org/abs/2311.16502))."
     )
     tags = ["vision-language"]
 
helm/benchmark/scenarios/vision_language/pairs_scenario.py
@@ -19,7 +19,7 @@ class PAIRSScenario(Scenario):
     """
     Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel Dataset of Parallel Images.
 
-    Modified to …
+    Modified to add an option to opt-out with "unclear" as a choice.
 
     @misc{fraser2024examining,
     title={Examining Gender and Racial Bias in Large Vision-Language Models Using a Novel
@@ -186,7 +186,7 @@ class PAIRSScenario(Scenario):
     name = "pairs"
     description = (
         "Examining gender and racial bias in VLMs Using a Novel Dataset of Parallel Images. "
-        "([…
+        "([Fraser et al., 2024](https://arxiv.org/abs/2402.05779))."
     )
     tags = ["vision-language", "bias"]
 
@@ -232,13 +232,14 @@ class PAIRSScenario(Scenario):
                     MediaObject(location=local_image_path, content_type="image/png"),
                     MediaObject(text=question.text, content_type="text/plain"),
                 ]
+                references = [Reference(Output(text=choice), tags=[]) for i, choice in enumerate(question.choices)]
+                # Add the preferred choice "unclear" as the correct answer
+                references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
+
                 instances.append(
                     Instance(
                         Input(multimedia_content=MultimediaObject(content)),
-                        references=[
-                            Reference(Output(text=choice), tags=[CORRECT_TAG] if i == question.preferred_choice else [])
-                            for i, choice in enumerate(question.choices)
-                        ],
+                        references=references,
                         split=TEST_SPLIT,
                     )
                 )
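After this PAIRS change, scoring no longer rewards the parallel choice named in the dataset; instead "unclear" is appended as the only reference tagged correct. The resulting reference layout, sketched with made-up choice strings:

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

choices = ["a doctor", "a nurse"]  # hypothetical parallel-image choices
references = [Reference(Output(text=c), tags=[]) for c in choices]
references.append(Reference(Output(text="unclear"), tags=[CORRECT_TAG]))
# references[-1] is now the sole CORRECT_TAG reference, matching the opt-out design above.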
helm/benchmark/scenarios/vision_language/pope_scenario.py
@@ -42,7 +42,8 @@ class POPEScenario(Scenario):
 
     name = "pope"
     description = (
-        "Open-ended questions about hallucination images…
+        "Open-ended questions about hallucination images "
+        "([Li et al., 2023](https://aclanthology.org/2023.emnlp-main.20/))."
     )
     tags = ["vision-language", "visual question answering"]
     options: List[str] = ["Yes", "No"]
helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py
ADDED
@@ -0,0 +1,57 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class RealWorldQAScenario(Scenario):
+    """
+    RealWorldQA is a benchmark designed for real-world understanding. The dataset consists of anonymized
+    images taken from vehicles, in addition to other real-world images.
+
+    Blog post: https://x.ai/blog/grok-1.5v
+    Website: https://huggingface.co/datasets/xai-org/RealworldQA
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "xai-org/RealworldQA"
+
+    name = "real_world_qa"
+    description = (
+        "A benchmark designed to evaluate real-world spatial understanding capabilities of multimodal models "
+        "([xAI, 2024](https://x.ai/blog/grok-1.5v))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split=TEST_SPLIT, cache_dir=output_path)):
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=TEST_SPLIT)
+            )
+
+        return instances
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
@@ -35,10 +35,10 @@ class SEEDBenchScenario(Scenario):
     the multiple-choice metric for evaluating the performance of models.
 
     @article{li2023seed,
-    …
+    title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+    author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+    journal={arXiv preprint arXiv:2307.16125},
+    year={2023}
     }
 
     Paper: https://arxiv.org/abs/2307.16125
@@ -59,7 +59,9 @@ class SEEDBenchScenario(Scenario):
     }
 
     name = "seed_bench"
-    description = …
+    description = (
+        "Evaluate multimodal models on 9 evaluation aspects " "([Li et al., 2023](https://arxiv.org/abs/2307.16125))."
+    )
     tags = ["vision-language"]
 
     def __init__(self, subject: str):