crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/adaptation/adapter_spec.py
@@ -39,90 +39,91 @@ class AdapterSpec:
     Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
     """

-    # Method of adaptation
     method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""

-    # Prepend all prompts with this string.
-    # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""

-    # Append all prompts with this string.
     global_suffix: str = ""
+    """The string that is appended to the entire prompt."""

-    # Prompt starts with instructions
     instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""

-    # What goes before the input
     input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""

-    # What goes after the input
     input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""

-    # What goes before the input (for multiple choice)
     reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""

-    # What goes before the input (for multiple choice)
     reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""

-    # What goes before the output
     output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

-    # What goes after the output
     output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""

-    # What goes between instruction and in-context example blocks in the constructed prompt
     instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""

-    # List of regular expression substitutions that we perform
     substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""

-    # Maximum number of (in-context) training instances to put into the prompt
     max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""

-    # Maximum number of evaluation instances. For getting valid numbers, this
-    # should be the entire dataset; only reduce this for piloting.
     max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""

-    # Generate this many outputs (which could be realized by `num_completions`
-    # or `top_k_per_token`).
     num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""

-    # Number of trials, where in each trial we choose an independent, random
-    # set of training instances. Used to compute error bars.
     num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""

-    # Number of trials, where we query the model with the same requests, but different random seeds
     num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""

-    # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples"""

     # Decoding parameters (inherited by `Request`)

-    # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""

-    # Model to make the request to
     model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""

-    # Temperature to use
     temperature: float = 1
+    """Temperature parameter used in generation."""

-    # Maximum number of tokens to generate
     max_tokens: int = 100
+    """Maximum number of tokens to generate."""

-    # When to stop (set hash=False to make `AdapterSpec` hashable)
+    # Set hash=False to make `AdapterSpec` hashable
     stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""

     # Random string (used concretely to bypass cache / see diverse results)
     random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""

-    # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all the correct references rather than any of the correct references.
     multi_label: bool = False
+    """If true, for instances with multiple correct reference, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""

-    # Parameters for image generation
     image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""

-    # The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
+    # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""

helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py
@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,

helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py
@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str

+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
+
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str

@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)

+        # Add the global prefix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
+
         return result

     @property

helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py
@@ -32,6 +32,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="[START]",
+            global_suffix="",
             instance_prefix="\n",
             instructions="Please answer the following questions about the images.",
             train_instance_blocks=train_instance_blocks,
@@ -67,6 +68,7 @@ class TestMultimodalContent(unittest.TestCase):

         prompt = MultimodalPrompt(
             global_prefix="",
+            global_suffix="",
             instance_prefix="\n",
             instructions="",
             train_instance_blocks=[],

helm/benchmark/annotation/air_bench_annotator.py
@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+
+    name = "air_bench_2024"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "stanford-crfm/air-bench-2024", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
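
The annotator above relies on its lenient regex to pull the judge's reasoning and score out of free-form model output. A small standalone sketch of that parsing step, using a raw-string version of the same pattern and a made-up judge response:

import re

# Same pattern as the annotator above, written as a raw string for this standalone sketch.
pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

# Made-up judge output in the expected "##short_reasoning / ##the_score" format.
judge_output = "## short_reasoning : The response refuses to assist.\n##the_score: 1.0"

match = pattern.search(judge_output)
assert match is not None
reasoning = match[1].strip()     # "The response refuses to assist."
score = float(match[2].strip())  # 1.0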

helm/benchmark/annotation/annotator_factory.py
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional

+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ class AnnotatorFactory:
             provider_bindings={
                 "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                 "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                "auto_client": lambda: AutoClient(
+                    credentials=self.credentials,
+                    file_storage_path=self.file_storage_path,
+                    cache_backend_config=self.cache_backend_config,
+                ),
             },
         )
         annotator = create_object(annotator_spec)
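
The new `auto_client` entry above is a lazy provider binding: the `AutoClient` is only constructed if an annotator's constructor actually declares an `auto_client` parameter. A self-contained toy of that pattern (an illustration only, not HELM's `create_object` implementation):

import inspect
from typing import Any, Callable, Dict


def create_with_bindings(cls: type, provider_bindings: Dict[str, Callable[[], Any]]) -> Any:
    # Only evaluate the providers whose names match the constructor's parameters.
    params = inspect.signature(cls.__init__).parameters
    kwargs = {name: provider() for name, provider in provider_bindings.items() if name in params}
    return cls(**kwargs)


class ToyAnnotator:
    def __init__(self, auto_client: Any, file_storage_path: str):
        self.auto_client = auto_client
        self.file_storage_path = file_storage_path


annotator = create_with_bindings(
    ToyAnnotator,
    {
        "api_key": lambda: "secret",            # not requested by ToyAnnotator, never evaluated
        "auto_client": lambda: object(),        # stands in for AutoClient(...)
        "file_storage_path": lambda: "/tmp/annotators",
    },
)
assert annotator.file_storage_path == "/tmp/annotators"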

helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py
@@ -18,7 +18,7 @@ class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""

     name: str = "lilypond_compiler"
-    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"
+    base_path = "lilypond-2.24.3/bin"

     def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)

helm/benchmark/annotation/live_qa_annotator.py
@@ -0,0 +1,84 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class LiveQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "live_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_responses = ""
+        for i, reference in enumerate(request_state.instance.references):
+            correct_responses += "Correct Response #" + str(i) + ": " + reference.output.text + "\n"
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_responses>
+{{CORRECT_RESPONSES}}
+</correct_responses>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
+# 0.3, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>"""  # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSES}}", correct_responses)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
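
The grading prompt above is built with plain string substitution over the `{{...}}` placeholders. A short sketch of that step with made-up question, answer, and reference texts; the template below is a truncated stand-in for the full judge prompt shown in the diff:

# Truncated stand-in for the judge prompt shown above.
prompt_template = (
    "<user_request>\n{{QUESTION}}\n</user_request>\n"
    "<model_response>\n{{ANSWER}}\n</model_response>\n"
    "<correct_responses>\n{{CORRECT_RESPONSES}}\n</correct_responses>"
)

# Made-up data standing in for the request state fields.
question = "What are common symptoms of iron deficiency?"
answer = "Fatigue and pale skin are common symptoms."
references = ["Fatigue, weakness, and pale skin", "Shortness of breath"]

correct_responses = ""
for i, reference in enumerate(references):
    correct_responses += "Correct Response #" + str(i) + ": " + reference + "\n"

annotator_prompt = (
    prompt_template.replace("{{QUESTION}}", question)
    .replace("{{ANSWER}}", answer)
    .replace("{{CORRECT_RESPONSES}}", correct_responses)
)
print(annotator_prompt)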

helm/benchmark/annotation/medication_qa_annotator.py
@@ -0,0 +1,81 @@
+import os
+import re
+from typing import Any
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+class MedicationQAAnnotator(Annotator):
+    """The LiveQA autograder."""
+
+    name = "medication_qa"
+
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        correct_response = request_state.instance.references[0].output.text
+        prompt_template = """You are a medical judge will be given a user request and a model response to that request. Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+The model's response will be provided in these tags:
+<model_response>
+{{ANSWER}}
+</model_response>
+Some potential correct responses will be provided in these tags:
+<correct_response>
+{{CORRECT_RESPONSE}}
+</correct_response>
+Carefully analyze the <model_response> and determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
+# 1, if the model response fully matches the correct response content and intent for at least one correct response
+# 0.5, if the model response contains some correct information but is incomplete and/or contains incorrect information relative to the correct response(s)
+# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
+Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag. Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
+Format:
+##short_reasoning: <your concise and brief reasoning>
+##the_score: <your score>"""  # noqa: E501
+        annotator_prompt = (
+            prompt_template.strip('"')
+            .strip()
+            .replace("{{QUESTION}}", model_input_text)
+            .replace("{{ANSWER}}", model_output_text)
+            .replace("{{CORRECT_RESPONSE}}", correct_response)
+        )
+        annotator_request = Request(
+            model="openai/gpt-4-turbo-2024-04-09",
+            model_deployment="openai/gpt-4-turbo-2024-04-09",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

helm/benchmark/augmentations/perturbation.py
@@ -48,11 +48,27 @@ class TextPerturbation(Perturbation, ABC):

         description = replace(self.description, seed=seed)

+        perturbed_input: Input
+        if instance.input.multimedia_content:
+            perturbed_media_objects = []
+            for media_object in instance.input.multimedia_content.media_objects:
+                # Apply perturbations to the text data of the multimedia content
+                if media_object.is_type("text") and media_object.text is not None:
+                    perturbed_media_objects.append(replace(media_object, text=self.perturb(media_object.text, rng)))
+                else:
+                    perturbed_media_objects.append(media_object)
+
+            perturbed_input = Input(
+                multimedia_content=replace(instance.input.multimedia_content, media_objects=perturbed_media_objects)
+            )
+        else:
+            perturbed_input = Input(text=self.perturb(instance.input.text, rng))
+
         # Don't modify `id` of `Instance` here.
         # All the perturbed Instances generated from a single Instance should have the same ID.
         return replace(
             instance,
-            input=Input(text=self.perturb(instance.input.text, rng)),
+            input=perturbed_input,
             references=references,
             perturbation=description,
             contrast_inputs=[instance.input],

helm/benchmark/augmentations/test_perturbation.py
@@ -2,6 +2,7 @@
 from typing import List
 import unittest

+from helm.common.media_object import MediaObject, MultimediaObject
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
 from .data_augmenter import DataAugmenter
 from .extra_space_perturbation import ExtraSpacePerturbation
@@ -33,6 +34,35 @@ def test_extra_space_perturbation():
     assert instances[1].references[0].output.text == "some name"


+def test_multimodal_text_perturbation():
+    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
+    input: Input = Input(
+        multimedia_content=MultimediaObject(
+            [
+                MediaObject(text="Hello what is", content_type="text/plain"),
+                MediaObject(text="your name", content_type="text/plain"),
+            ]
+        )
+    )
+    instance: Instance = Instance(id="id0", input=input, references=[Reference(Output(text="some name"), tags=[])])
+    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
+
+    assert len(instances) == 2
+
+    # Test that the first instance is unperturbed
+    assert instances[0].id == "id0"
+    assert instances[0].perturbation is None
+    media_objects = instances[0].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+
+    assert instances[1].id == "id0"
+    assert instances[1].perturbation.name == "extra_space"
+    media_objects = instances[1].input.multimedia_content.media_objects
+    assert media_objects[0].text == "Hello what is"
+    assert media_objects[1].text == "your name"
+
+
 def test_misspelling_perturbation():
     data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
     instance: Instance = Instance(

helm/benchmark/augmentations/translate_perturbation.py
@@ -17,6 +17,7 @@ class TranslatePerturbation(TextPerturbation):
     language_code: str = "zh-CN"

     name: str = "translate"
+    should_perturb_references: bool = True

     def __init__(self, language_code: str):
         self.language_code: str = language_code

helm/benchmark/huggingface_registration.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, Dict, Union

 from helm.benchmark.model_deployment_registry import (
     ClientSpec,
@@ -17,14 +17,22 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer


 def register_huggingface_model(
-    helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
+    helm_model_name: str,
+    pretrained_model_name_or_path: str,
+    revision: Optional[str] = None,
+    openvino: Optional[bool] = False,
 ) -> None:
-    object_spec_args = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    object_spec_args: Dict[str, Union[str, bool]] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
     if revision:
         object_spec_args["revision"] = revision
+    if openvino:
+        object_spec_args["openvino"] = openvino

     # Auto-infer model properties from the tokenizer.
-    with HuggingFaceTokenizer.create_tokenizer(**object_spec_args) as tokenizer:
+    create_tokenizer_args: Dict[str, str] = {"pretrained_model_name_or_path": pretrained_model_name_or_path}
+    if revision:
+        create_tokenizer_args["revision"] = revision
+    with HuggingFaceTokenizer.create_tokenizer(**create_tokenizer_args) as tokenizer:
         max_sequence_length = tokenizer.model_max_length
         end_of_text_token = tokenizer.eos_token or ""
         prefix_token = tokenizer.bos_token or ""
@@ -71,7 +79,7 @@ def register_huggingface_model(
     register_tokenizer_config(tokenizer_config)


-def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> None:
+def register_huggingface_hub_model_from_flag_value(raw_model_string: str, openvino=False) -> None:
     raw_model_string_parts = raw_model_string.split("@")
     pretrained_model_name_or_path: str
     revision: Optional[str]
@@ -88,10 +96,11 @@ def register_huggingface_hub_model_from_flag_value(raw_model_string: str) -> Non
         helm_model_name=raw_model_string,
         pretrained_model_name_or_path=pretrained_model_name_or_path,
         revision=revision,
+        openvino=openvino,
     )


-def register_huggingface_local_model_from_flag_value(path: str) -> None:
+def register_huggingface_local_model_from_flag_value(path: str, openvino=False) -> None:
     if not path:
         raise ValueError("Path to Hugging Face model must be non-empty")
     path_parts = os.path.split(path)
@@ -99,4 +108,5 @@ def register_huggingface_local_model_from_flag_value(path: str) -> None:
     register_huggingface_model(
         helm_model_name=helm_model_name,
         pretrained_model_name_or_path=path,
+        openvino=openvino,
     )
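
A hedged usage sketch of the new `openvino` pass-through: the flag simply threads from the flag-value helpers down to `register_huggingface_model`, as shown above. The model string and local path below are placeholders.

from helm.benchmark.huggingface_registration import (
    register_huggingface_hub_model_from_flag_value,
    register_huggingface_local_model_from_flag_value,
)

# Hub model pinned to a revision, registered with the OpenVINO option enabled.
register_huggingface_hub_model_from_flag_value("some-org/some-model@main", openvino=True)

# Local checkpoint, same flag.
register_huggingface_local_model_from_flag_value("/path/to/local/model", openvino=True)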

helm/benchmark/metrics/air_bench_metrics.py
@@ -0,0 +1,56 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.basic_metrics import compute_request_state_metrics
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class AIRBench2024BasicGenerationMetric(Metric):
+    """Replacement for BasicGenerationMetric for AIRBench 2024.
+
+    We call compute_request_state_metrics here because we can't use `BasicGenerationMetric`
+    because we abuse "references" to store metadata rather than true metadata."""
+
+    def __init__(self):
+        super().__init__()
+        self.efficiency_metric = EfficiencyMetric()
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        return compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)
+
+
+class AIRBench2024ScoreMetric(Metric):
+    """Score metrics for AIRBench 2024."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) > 1
+        category_text = request_state.instance.references[0].output.text
+        category_parts = category_text.split(".")
+        assert len(category_parts) == 3
+        assert request_state.annotations
+        score = request_state.annotations["air_bench_2024"]["score"]
+        return [
+            Stat(MetricName("air_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_score")).add(score),
+            Stat(MetricName(f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score")).add(
+                score
+            ),
+        ]
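
The score metric above fans each annotated score out into one overall stat plus three hierarchical category stats derived from the dotted category id stored in the first reference. A quick illustration with a made-up category id:

# Made-up category id and score; the real values come from the instance references and annotations.
category_text = "1.2.3"
score = 0.5

category_parts = category_text.split(".")
metric_names = [
    "air_score",
    f"air_category_{category_parts[0]}_score",
    f"air_category_{category_parts[0]}_{category_parts[1]}_score",
    f"air_category_{category_parts[0]}_{category_parts[1]}_{category_parts[2]}_score",
]
# -> ["air_score", "air_category_1_score", "air_category_1_2_score", "air_category_1_2_3_score"]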

helm/benchmark/metrics/efficiency_metrics.py
@@ -91,8 +91,15 @@ class EfficiencyMetric:
         window_service: WindowService = WindowServiceFactory.get_window_service(
             adapter_spec.model_deployment, tokenizer_service
         )
-        prompt: str = request_state.request.prompt
-        num_prompt_tokens: int = window_service.get_num_tokens(prompt)
+
+        prompt: str
+        num_prompt_tokens: int
+        if request_state.request.multimodal_prompt is not None:
+            prompt = request_state.request.multimodal_prompt.text
+            num_prompt_tokens = window_service.get_num_tokens(prompt)
+        else:
+            prompt = request_state.request.prompt
+            num_prompt_tokens = window_service.get_num_tokens(prompt)

         # Total number of tokens in the completion.
         num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])