crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Files changed (56)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
  3. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  5. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  6. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  7. helm/benchmark/augmentations/perturbation.py +17 -1
  8. helm/benchmark/augmentations/test_perturbation.py +30 -0
  9. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  10. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  11. helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
  12. helm/benchmark/model_metadata_registry.py +5 -1
  13. helm/benchmark/run_expander.py +35 -63
  14. helm/benchmark/run_spec_factory.py +11 -10
  15. helm/benchmark/run_specs/vlm_run_specs.py +294 -38
  16. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  17. helm/benchmark/scenarios/math_scenario.py +1 -1
  18. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  19. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  20. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  21. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  22. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  23. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
  24. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
  25. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  26. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  27. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  28. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  29. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  30. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  31. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  32. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  33. helm/benchmark/static/schema_image2structure.yaml +304 -0
  34. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  35. helm/benchmark/static/schema_vlm.yaml +257 -10
  36. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  37. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  38. helm/benchmark/static_build/index.html +2 -2
  39. helm/clients/anthropic_client.py +36 -6
  40. helm/clients/openai_client.py +2 -3
  41. helm/clients/together_client.py +93 -2
  42. helm/clients/vertexai_client.py +59 -50
  43. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  44. helm/clients/vision_language/huggingface_vlm_client.py +11 -4
  45. helm/clients/vision_language/idefics_client.py +2 -2
  46. helm/common/images_utils.py +10 -3
  47. helm/config/model_deployments.yaml +100 -2
  48. helm/config/model_metadata.yaml +136 -31
  49. helm/config/tokenizer_configs.yaml +7 -0
  50. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  51. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  52. helm/benchmark/test_model_deployment_definition.py +0 -90
  53. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  54. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
  55. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  56. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/vision_language/image_metrics.py

@@ -28,7 +28,7 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color

 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
@@ -78,7 +78,9 @@ class AnnotatedImageMetrics(Metric):

     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-    EARTH_MOVER_SIMILARITY: str = "earth_mover_similarity"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM1: str = "block_emd_similarity_white"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM2: str = "block_emd_similarity_median_color"
+    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -106,7 +108,12 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-            AnnotatedMetric(self.EARTH_MOVER_SIMILARITY, self.compute_emd_similarity_recursive, "image_PIL"),
+            # Raw block EMD
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY, self.compute_block_emd_raw, "image_PIL"),
+            # Normalized block EMD against white
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1, self.compute_block_emd_white, "image_PIL"),
+            # Normalized block EMD against median
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -407,7 +414,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result

-    def compute_emd_similarity_recursive(
+    def compute_block_emd_white(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -417,17 +424,23 @@ class AnnotatedImageMetrics(Metric):
         weight_most_frequent_color: float = 0.001,
         use_tqdm: bool = False,
     ):
-        emd_value = compute_emd_recursive(
-            pred_image,
-            ref_image,
-            threshold_most_frequent_color,
-            patch_size,
-            max_num_patches,
-            weight_most_frequent_color,
-            use_tqdm,
-        )
+        """Computes the block Earth Mover's Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against the EMD against a white image.
+        """

-        def do_it():
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
             constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
             value = compute_emd_recursive(
                 constant_image,
@@ -443,8 +456,120 @@ class AnnotatedImageMetrics(Metric):
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key = {"metric_name": f"intermediate_{self.EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_median(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Same as compute_block_emd_white EXCEPT that
+        the normalization is against an image of the median color.
+        """
+
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
+            ref_img_np = np.array(ref_image)
+            (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+
+            # Most frequent color as base
+            constant_image = Image.new("RGB", ref_image.size, tuple(rgb_most_frequent_color))  # type: ignore
+            value = compute_emd_recursive(
+                constant_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+            return {"value": value}
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_raw(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        def compute():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
         assert self._cache is not None
-        response_metric, _ = self._cache.get(cache_key, do_it)
+        emd_raw, _ = self._cache.get(cache_key, compute)
+
+        return emd_raw["value"]

-        return 1.0 - emd_value / response_metric["value"]
+    def compute_block_emd_raw_wrapper(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Computes the raw block Earth Mover's Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels.
+        """
+        emd_value = compute_emd_recursive(
+            pred_image,
+            ref_image,
+            threshold_most_frequent_color,
+            patch_size,
+            max_num_patches,
+            weight_most_frequent_color,
+            use_tqdm,
+        )
+        return {"value": emd_value}
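Taken together, the image_metrics.py hunks split the old earth_mover_similarity metric into one raw block-EMD value and two normalized similarities that differ only in their baseline image. A minimal sketch of the normalization arithmetic, assuming a generic emd(a, b) callable as a stand-in for compute_emd_recursive (the helper name and signature here are illustrative, not part of the package):

from PIL import Image

def normalized_block_emd_similarity(pred: Image.Image, ref: Image.Image, emd) -> float:
    # Baseline for block_emd_similarity_white: an all-white image the same
    # size as the reference. compute_block_emd_median instead builds the
    # baseline from the reference's most frequent color.
    baseline = Image.new("RGB", ref.size, (255, 255, 255))
    # A perfect prediction scores 1.0; a prediction as far from the
    # reference as the blank baseline scores 0.0.
    return 1.0 - emd(pred, ref) / emd(baseline, ref)

Because the numerator is cached under the raw metric's intermediate_block_emd_similarity key, the expensive EMD computation can be shared between the raw metric and both normalized variants when the same image pair is evaluated.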
helm/benchmark/model_metadata_registry.py

@@ -32,6 +32,7 @@ ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"

 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
 GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMINI_PRO_VISION_V1_TAG: str = "GOOGLE_GEMINI_PRO_VISION_V1_TAG"
 GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"

 # Models which emit garbage tokens when temperature=0.
@@ -159,7 +160,10 @@ def register_model_metadata(model_metadata: ModelMetadata) -> None:
 def get_model_metadata(model_name: str) -> ModelMetadata:
     """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
-        raise ValueError(f"No model with name: {model_name}")
+        raise ValueError(
+            f"No model metadata for model name: {model_name} - "
+            "did you remember to add this model to model_metadata.yaml?"
+        )

     return MODEL_NAME_TO_MODEL_METADATA[model_name]

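The new message turns a bare lookup failure into an actionable hint. A quick sketch of how it surfaces (the model name below is made up for illustration):

from helm.benchmark.model_metadata_registry import get_model_metadata

try:
    get_model_metadata("example-org/not-a-real-model")  # hypothetical name
except ValueError as e:
    print(e)
    # No model metadata for model name: example-org/not-a-real-model -
    # did you remember to add this model to model_metadata.yaml?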
helm/benchmark/run_expander.py

@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -322,6 +324,16 @@ class AnthropicClaude3RunExpander(RunExpander):
     name = "claude_3"

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        # Remove all stop sequences that do not contain non-whitespace characters.
+        # This prevents the Anthropic API from returning the following error:
+        # "stop_sequences: each stop sequence must contain non-whitespace"
+        stop_sequences_with_non_whitespace = [
+            stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+        ]
+        run_spec = replace(
+            run_spec,
+            adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+        )
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
             instructions = "Answer with only a single letter."
             if run_spec.adapter_spec.instructions:
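The filter works because str.strip() returns an empty, falsy string for whitespace-only entries. With illustrative values:

stop_sequences = ["\n", " ", "###", "\t", "END"]
filtered = [s for s in stop_sequences if s.strip()]
print(filtered)  # ['###', 'END'] - whitespace-only stop sequences are dropped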
@@ -335,78 +347,37 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]


-class OpenAIRunExpander(RunExpander):
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
-
-    name = "openai"
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.

-    def __init__(self):
-        pass
+    The argument controls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.

-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.

-
-class GoogleRunExpander(RunExpander):
-    """
-    Custom prompt for Google models.
-    These models need more explicit instructions about following the format.
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
     """

-    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+    name = "follow_format_instructions"

-    name = "google"
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value

     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")

-
-class MistralRunExpander(RunExpander):
-    """Custom prompt for Mistral models."""
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-    name = "output_format_instructions"
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]

         return [
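A short sketch of driving the consolidated expander directly (constructing a full RunSpec is elided; whether the "instruct" value applies to a given model depends on the INSTRUCTION_FOLLOWING_MODEL_TAG in model_metadata.yaml):

from helm.benchmark.run_expander import FollowFormatInstructionsRunExpander

expander = FollowFormatInstructionsRunExpander("instruct")
# expander.expand(run_spec) returns [run_spec] unchanged for models without
# the instruction-following tag; otherwise it rewrites global_prefix and
# global_suffix with the in-context-learning instructions.

try:
    FollowFormatInstructionsRunExpander("sometimes")
except ValueError:
    pass  # only "all" and "instruct" are accepted

In a run entry this would presumably be spelled follow_format_instructions=instruct, matching the expander's name attribute.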
@@ -539,7 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-        "heim_human_eval": [0, 1, 2, 4, 8],
+        "vhelm": [0, 1, 2, 4, 8],
     }

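Since MaxTrainInstancesRunExpander is a ReplaceValueRunExpander, a single run entry using the renamed setting fans out into one run per listed value. Schematically (the run entry and naming below are illustrative, not real HELM output):

values = [0, 1, 2, 4, 8]  # what "vhelm" (formerly "heim_human_eval") expands to
base_entry = "mscoco_captioning:model=example/model"  # hypothetical entry
runs = [f"{base_entry},max_train_instances={n}" for n in values]
# -> five runs, zero-shot through 8-shot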
@@ -1415,6 +1386,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NewlineRunExpander,
     StopRunExpander,
     FormatPromptRunExpander,
+    FollowFormatInstructionsRunExpander,
     AddToStopRunExpander,
     GlobalPrefixRunExpander,
     NumTrainTrialsRunExpander,
helm/benchmark/run_spec_factory.py

@@ -4,7 +4,6 @@ from typing import List
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
-    ADAPT_GENERATION_MULTIMODAL,
 )
 from helm.benchmark.model_deployment_registry import (
     ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
 from helm.benchmark.model_metadata_registry import (
     ANTHROPIC_CLAUDE_1_MODEL_TAG,
     ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    ANTHROPIC_CLAUDE_3_MODEL_TAG,
     BUGGY_TEMP_0_TAG,
     CHATML_MODEL_TAG,
-    GOOGLE_GEMINI_MODEL_TAG,
+    GOOGLE_GEMINI_PRO_VISION_V1_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
-    IDEFICS_MODEL_TAG,
     LLAVA_MODEL_TAG,
     OPEN_FLAMINGO_MODEL_TAG,
-    VISION_LANGUAGE_MODEL_TAG,
     NLG_PREFIX_TAG,
     NO_NEWLINES_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
     ModelMetadata,
     get_model_metadata,
 )
 from helm.benchmark.run_expander import (
     RUN_EXPANDERS,
     AnthropicClaude2RunExpander,
+    AnthropicClaude3RunExpander,
     ChatMLRunExpander,
     GlobalPrefixRunExpander,
     IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
         run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))

-    # Google Gemini Vision returns an empty completion or throws an error if max_tokens is 1
+    # Anthropic Claude 3
+    if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+        run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+    # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
     if (
         VISION_LANGUAGE_MODEL_TAG in model.tags
-        and GOOGLE_GEMINI_MODEL_TAG in model.tags
+        and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
        and run_spec.adapter_spec.max_tokens == 1
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
-        # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-        if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
             run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
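A minimal sketch of what the Gemini guard is assumed to do: IncreaseMaxTokensRunExpander(value=1) reads as adding its value to the existing budget, so an adapter spec requesting a single token is bumped to two (the dataclass below is a stand-in for HELM's AdapterSpec, not the real class):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class AdapterSpecStub:
    max_tokens: int

spec = AdapterSpecStub(max_tokens=1)
spec = replace(spec, max_tokens=spec.max_tokens + 1)  # value=1: 1 -> 2
assert spec.max_tokens == 2  # Gemini Pro Vision v1 never sees max_tokens == 1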