crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/adaptation/adapter_spec.py

@@ -39,90 +39,91 @@ class AdapterSpec:
     Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
     """

-    # Method of adaptation
     method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""

-    # Prepend all prompts with this string.
-    # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""

-    # Append all prompts with this string.
     global_suffix: str = ""
+    """The string that is appended to the entire prompt."""

-    # Prompt starts with instructions
     instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""

-    # What goes before the input
     input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""

-    # What goes after the input
     input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""

-    # What goes before the input (for multiple choice)
     reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""

-    # What goes before the input (for multiple choice)
     reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""

-    # What goes before the output
     output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

-    # What goes after the output
     output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""

-    # What goes between instruction and in-context example blocks in the constructed prompt
     instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""

-    # List of regular expression substitutions that we perform
     substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""

-    # Maximum number of (in-context) training instances to put into the prompt
     max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""

-    # Maximum number of evaluation instances. For getting valid numbers, this
-    # should be the entire dataset; only reduce this for piloting.
     max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""

-    # Generate this many outputs (which could be realized by `num_completions`
-    # or `top_k_per_token`).
     num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""

-    # Number of trials, where in each trial we choose an independent, random
-    # set of training instances. Used to compute error bars.
     num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""

-    # Number of trials, where we query the model with the same requests, but different random seeds
     num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""

-    # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples"""

     # Decoding parameters (inherited by `Request`)

-    # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""

-    # Model to make the request to
     model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""

-    # Temperature to use
     temperature: float = 1
+    """Temperature parameter used in generation."""

-    # Maximum number of tokens to generate
     max_tokens: int = 100
+    """Maximum number of tokens to generate."""

-    #
+    # Set hash=False to make `AdapterSpec` hashable
     stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""

     # Random string (used concretely to bypass cache / see diverse results)
     random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""

-    # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all the correct references rather than any of the correct references.
     multi_label: bool = False
+    """If true, for instances with multiple correct reference, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""

-    # Parameters for image generation
     image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""

-    #
+    # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""

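For orientation, here is a minimal sketch of how these documented fields are typically filled in when building a run. The field names and defaults come from the diff above; the concrete values (instructions, prefixes, caps) are illustrative only, not code from the package.

from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Illustrative AdapterSpec for a generation-style task.
# Field names/defaults are from the diff; the specific values here are made up.
spec = AdapterSpec(
    method="generation",                  # high-level adaptation strategy
    instructions="Answer the question.",  # task description at the top of the prompt
    input_prefix="Question: ",            # placed before each input
    output_prefix="Answer: ",             # placed before the (expected) output
    max_train_instances=5,                # in-context examples (sampled when sample_train=True)
    max_eval_instances=100,               # cap on evaluation instances (None = all)
    temperature=0.0,                      # decoding parameters are inherited by `Request`
    max_tokens=100,
    stop_sequences=["\n"],
)
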
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py

@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,

helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py

@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str

+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
+
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str

@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)

+        # Add the global prefix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
+
         return result

     @property

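Taken together with the `AdapterSpec.global_suffix` change above, the flattened prompt is now wrapped on both sides. Below is a rough sketch of the composition implied by the diff; it is not the package's rendering code, and the real `content` property may differ in detail.

# Rough sketch: how the pieces of a MultimodalPrompt compose once flattened to text.
# `blocks` stands in for the already-rendered instruction/train/eval instance blocks.
def flatten_prompt(global_prefix: str, global_suffix: str, instance_prefix: str, blocks: list) -> str:
    text = instance_prefix.join(blocks)
    if global_prefix:
        text = global_prefix + text  # mirrors result.add_textual_prefix(...)
    if global_suffix:
        text = text + global_suffix  # mirrors the new result.add_textual_suffix(...)
    return text
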
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py

@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="[START]",
+            global_suffix="",
             instance_prefix="\n",
             instructions="Please answer the following questions about the images.",
             train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="",
+            global_suffix="",
             instance_prefix="\n",
             instructions="",
             train_instance_blocks=[],

helm/benchmark/annotation/air_bench_annotator.py

@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+
+    name = "air_bench_2024"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "stanford-crfm/air-bench-2024", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

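This annotator and the other new judge-based annotators in this release (LiveQA, MedicationQA) parse the judge's reply with the same lenient `##short_reasoning` / `##the_score` pattern. A small standalone sketch of that parsing step follows; the regex is taken from the code above, while the sample reply is made up.

import re

# Same lenient pattern as the annotators; tolerates extra whitespace around the labels.
pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

sample_reply = "##short_reasoning: Matches the reference answer. ##the_score: 1"  # hypothetical judge output
match = pattern.search(sample_reply)
assert match is not None
reasoning = match[1].strip()     # "Matches the reference answer."
score = float(match[2].strip())  # 1.0
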
helm/benchmark/annotation/annotator_factory.py

@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional

+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ class AnnotatorFactory:
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                     "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                    "auto_client": lambda: AutoClient(
+                        credentials=self.credentials,
+                        file_storage_path=self.file_storage_path,
+                        cache_backend_config=self.cache_backend_config,
+                    ),
                 },
             )
             annotator = create_object(annotator_spec)

helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py

@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""

     name: str = "lilypond_compiler"
-    base_path = "
+    base_path = "lilypond-2.24.3/bin"

     def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)

helm/benchmark/annotation/live_qa_annotator.py

@@ -0,0 +1,84 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class LiveQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "live_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_responses = ""
+        for i, reference in enumerate(request_state.instance.references):
+            correct_responses += "Correct Response #" + str(i) + ": " + reference.output.text + "\n"
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_responses>
+{{CORRECT_RESPONSES}}
+</correct_responses>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
+# 0.3, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>""" # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSES}}", correct_responses)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

helm/benchmark/annotation/medication_qa_annotator.py

@@ -0,0 +1,81 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class MedicationQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "medication_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_response = request_state.instance.references[0].output.text
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_response>
+{{CORRECT_RESPONSE}}
+</correct_response>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.5, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>""" # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSE}}", correct_response)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

helm/benchmark/augmentations/perturbation.py

@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):

         description = replace(self.description, seed=seed)

+        perturbed_input: Input
+        if instance.input.multimedia_content:
+            perturbed_media_objects = []
+            for media_object in instance.input.multimedia_content.media_objects:
+                # Apply perturbations to the text data of the multimedia content
+                if media_object.is_type("text") and media_object.text is not None:
+                    perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
+                else:
+                    perturbed_media_objects.append(media_object)
+
+            perturbed_input = Input(
+                multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
+            )
+        else:
+            perturbed_input = Input(text=self.perturb(instance.input.text, rng))
+
         # Don't modify `id` of `Instance` here.
         # All the perturbed Instances generated from a single Instance should have the same ID.
         return replace(
             instance,
-            input=
+            input=perturbed_input,
             references=references,
             perturbation=description,
             contrast_inputs=[instance.input],

helm/benchmark/augmentations/test_perturbation.py

@@ -2,6 +2,7 @@
 from typing import List
 import unittest

+from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
 from .data_augmenter import DataAugmenter
 from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
     assert instances[1].references[0].output.text == "some name"


+def test_multimodal_text_perturbation():
+    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
+    input: Input = Input(
+        multimedia_content=MultimediaObject(
+            [
+                MediaObject(text="Hello what is", content_type="text/plain"),
+                MediaObject(text="your name", content_type="text/plain"),
+            ]
+        )
+    )
+    instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
+    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
+
+    assert len(instances) == 2
+
+    # Test that the first instance is unperturbed
+    assert instances[0].id == "id0"
+    assert instances[0].perturbation is None
+    media_objects = instances[0].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+
+    assert instances[1].id == "id0"
+    assert instances[1].perturbation.name == "extra_space"
+    media_objects = instances[1].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello   what   is"
+    assert media_objects[1].text == "your   name"
+
+
 def test_misspelling_perturbation():
     data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
     instance: Instance = Instance(

helm/benchmark/huggingface_registration.py

@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, Dict, Union

 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
@@ -17,14 +17,22 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


 def register_huggingface_model(
-    helm_model_name: str,
+    helm_model_name: str,
+    pretrained_model_name_or_path: str,
+    revision: Optional[str] = None,
+    openvino: Optional[bool] = False,
 ) -> None:
-    object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
     if revision:
         object_spec_args["revision"] = revision
+    if openvino:
+        object_spec_args["openvino"] = openvino

     # Auto-infer model properties from the tokenizer.
-
+    create_tokenizer_args: Dict[str, str] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    if revision:
+        create_tokenizer_args["revision"] = revision
+    with HuggingFaceTokenizer.create_tokenizer(**create_tokenizer_args) as tokenizer:
         max_sequence_length = tokenizer.model_max_length
         end_of_text_token = tokenizer.eos_token or ""
         prefix_token = tokenizer.bos_token or ""
@@ -71,7 +79,7 @@ def register_huggingface_model(
     register_tokenizer_config(tokenizer_config)


-def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
+def register_huggingface_hub_model_from_flag_value(raw_model_string: str, openvino=False) -> None:
     raw_model_string_parts = raw_model_string.split("@")
     pretrained_model_name_or_path: str
     revision: Optional[str]
@@ -88,10 +96,11 @@ def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
         helm_model_name=raw_model_string,
         pretrained_model_name_or_path=pretrained_model_name_or_path,
         revision=revision,
+        openvino=openvino,
     )


-def register_huggingface_local_model_from_flag_value(path: str) -> None:
+def register_huggingface_local_model_from_flag_value(path: str, openvino=False) -> None:
     if not path:
         raise ValueError("Path to Hugging Face model must be non-empty")
     path_parts = os.path.split(path)
@@ -99,4 +108,5 @@ def register_huggingface_local_model_from_flag_value(path: str) -> None:
     register_huggingface_model(
         helm_model_name=helm_model_name,
         pretrained_model_name_or_path=path,
+        openvino=openvino,
     )

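A minimal usage sketch for the extended registration helpers follows; the repository name and local path are hypothetical, and `openvino=True` is simply forwarded into the client arguments as shown above.

from helm.benchmark.huggingface_registration import (
    register_huggingface_hub_model_from_flag_value,
    register_huggingface_local_model_from_flag_value,
)

# "<repo>@<revision>" is split on "@"; the raw flag value doubles as the HELM model name.
register_huggingface_hub_model_from_flag_value("my-org/my-model@main", openvino=True)  # hypothetical repo

# Local checkpoints are registered from a path; the model name is derived from the path.
register_huggingface_local_model_from_flag_value("/models/my-local-model", openvino=True)  # hypothetical path
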
helm/benchmark/metrics/air_bench_metrics.py

@@ -0,0 +1,56 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class AIRBench2024BasicGenerationMetric(Metric):
+    """Replacement for BasicGenerationMetric for AIRBench 2024.
+
+    We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+    because we abuse "references" to store metadata rather than true metadata."""
+
+    def __init__(self):
+        super().__init__()
+        self.efficiency_metric = EfficiencyMetric()
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+class AIRBench2024ScoreMetric(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) > 1
+        category_text = request_state.instance.references[0].output.text
+        category_parts = category_text.split(".")
+        assert len(category_parts) == 3
+        assert request_state.annotations
+        score = request_state.annotations["air_bench_2024"]["score"]
+        return [
+            Stat(MetricName("air_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add(
+                score
+            ),
+        ]

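For each judged instance, `AIRBench2024ScoreMetric` emits one overall stat plus three nested category stats derived from the dot-separated category id stored in the first reference. A small sketch of the naming scheme, using a made-up category id:

# Hypothetical category id, e.g. "1.2.3"; real ids come from the AIR-Bench references.
category_text = "1.2.3"
l1, l2, l3 = category_text.split(".")

metric_names = [
    "air_score",
    f"air_category_{l1}_score",            # "air_category_1_score"
    f"air_category_{l1}_{l2}_score",       # "air_category_1_2_score"
    f"air_category_{l1}_{l2}_{l3}_score",  # "air_category_1_2_3_score"
]
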
helm/benchmark/metrics/efficiency_metrics.py

@@ -91,8 +91,15 @@ class EfficiencyMetric:
         window_service: WindowService = WindowServiceFactory.get_window_service(
             adapter_spec.model_deployment, tokenizer_service
         )
-
-
+
+        prompt: str
+        num_prompt_tokens: int
+        if request_state.request.multimodal_prompt is not None:
+            prompt = request_state.request.multimodal_prompt.text
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
+        else:
+            prompt = request_state.request.prompt
+            num_prompt_tokens = window_service.get_num_tokens(prompt)

         # Total number of tokens in the completion.
         num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])