crfm-helm 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +7 -3
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/RECORD +53 -41
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +142 -17
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/run_expander.py +35 -63
- helm/benchmark/run_spec_factory.py +11 -10
- helm/benchmark/run_specs/vlm_run_specs.py +294 -38
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +257 -10
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +36 -6
- helm/clients/openai_client.py +2 -3
- helm/clients/together_client.py +93 -2
- helm/clients/vertexai_client.py +59 -50
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +11 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/common/images_utils.py +10 -3
- helm/config/model_deployments.yaml +100 -2
- helm/config/model_metadata.yaml +136 -31
- helm/config/tokenizer_configs.yaml +7 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/vision_language/image_metrics.py
CHANGED
@@ -28,7 +28,7 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
 
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
@@ -78,7 +78,9 @@ class AnnotatedImageMetrics(Metric):
 
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM1: str = "block_emd_similarity_white"
+    BLOCK_EARTH_MOVER_SIMILARITY_NORM2: str = "block_emd_similarity_median_color"
+    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -106,7 +108,12 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-
+            # Raw block EMD
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY, self.compute_block_emd_raw, "image_PIL"),
+            # Normalized block EMD against white
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1, self.compute_block_emd_white, "image_PIL"),
+            # Normalized block EMD against median
+            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -407,7 +414,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
 
-    def
+    def compute_block_emd_white(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -417,17 +424,23 @@ class AnnotatedImageMetrics(Metric):
         weight_most_frequent_color: float = 0.001,
         use_tqdm: bool = False,
     ):
-
-
-
-
-            patch_size,
-            max_num_patches,
-            weight_most_frequent_color,
-            use_tqdm,
-        )
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against EMD against white images.
+        """
 
-        def
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
             constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
             value = compute_emd_recursive(
                 constant_image,
@@ -443,8 +456,120 @@ class AnnotatedImageMetrics(Metric):
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
         }
-
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_median(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Same as compute_emd_similarity_recursive EXCEPT that
+        the normalization is against an image of the median color.
+        """
+
+        def compute_numerator():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        def compute_denominator():
+            ref_img_np = np.array(ref_image)
+            (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+
+            # Most frequent color as base
+            constant_image = Image.new("RGB", ref_image.size, tuple(rgb_most_frequent_color))  # type: ignore
+            value = compute_emd_recursive(
+                constant_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+            return {"value": value}
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
+        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2}", **hash_dict}
+
+        assert self._cache is not None
+        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
+        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
+
+        return 1.0 - emd_raw["value"] / emd_base["value"]
+
+    def compute_block_emd_raw(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        def compute():
+            return self.compute_block_emd_raw_wrapper(
+                pred_image,
+                ref_image,
+                threshold_most_frequent_color,
+                patch_size,
+                max_num_patches,
+                weight_most_frequent_color,
+                use_tqdm,
+            )
+
+        hash_dict = {
+            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+        }
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
         assert self._cache is not None
-
+        emd_raw, _ = self._cache.get(cache_key, compute)
+
+        return emd_raw["value"]
 
-
+    def compute_block_emd_raw_wrapper(
+        self,
+        pred_image: Image.Image,
+        ref_image: Image.Image,
+        threshold_most_frequent_color: float = 0.5,
+        patch_size: Tuple[int, int] = (8, 8),
+        max_num_patches: int = 100,
+        weight_most_frequent_color: float = 0.001,
+        use_tqdm: bool = False,
+    ):
+        """Computes the block Earth Moving Distance (EMD). This attempts to
+        speed up EMD for images with huge areas by considering movement/transformation
+        of blocks of pixels. The score is normalized against EMD against white images.
+        """
+        emd_value = compute_emd_recursive(
+            pred_image,
+            ref_image,
+            threshold_most_frequent_color,
+            patch_size,
+            max_num_patches,
+            weight_most_frequent_color,
+            use_tqdm,
+        )
+        return {"value": emd_value}
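For orientation, the three new block-EMD metrics above all reduce to the same normalization scheme: score = 1 - EMD(pred, ref) / EMD(baseline, ref), where the baseline is either an all-white image or an image filled with the reference's most frequent color. The following is an illustrative sketch of that scheme, not HELM's implementation: toy_block_distance stands in for compute_emd_recursive, and toy_most_frequent_color stands in for get_most_frequent_color.

# Hedged sketch of the normalization used by compute_block_emd_white /
# compute_block_emd_median; the distance function here is a deliberately
# simple per-patch mean absolute difference, not a real EMD.
from typing import Tuple

import numpy as np
from PIL import Image


def toy_block_distance(a: Image.Image, b: Image.Image, patch_size: Tuple[int, int] = (8, 8)) -> float:
    # Stand-in for compute_emd_recursive: average per-patch mean absolute difference.
    a_np = np.asarray(a.convert("RGB"), dtype=np.float32)
    b_np = np.asarray(b.convert("RGB"), dtype=np.float32)
    ph, pw = patch_size
    height = (min(a_np.shape[0], b_np.shape[0]) // ph) * ph
    width = (min(a_np.shape[1], b_np.shape[1]) // pw) * pw
    diffs = [
        float(np.abs(a_np[y : y + ph, x : x + pw] - b_np[y : y + ph, x : x + pw]).mean())
        for y in range(0, height, ph)
        for x in range(0, width, pw)
    ]
    return float(np.mean(diffs)) if diffs else 0.0


def toy_most_frequent_color(image: Image.Image) -> Tuple[int, int, int]:
    # Stand-in for get_most_frequent_color: the most common RGB value in the image.
    pixels = np.asarray(image.convert("RGB")).reshape(-1, 3)
    colors, counts = np.unique(pixels, axis=0, return_counts=True)
    r, g, b = colors[counts.argmax()]
    return int(r), int(g), int(b)


def normalized_block_similarity(pred: Image.Image, ref: Image.Image, baseline: str = "white") -> float:
    # score = 1 - dist(pred, ref) / dist(constant_baseline, ref)
    if baseline == "white":
        constant = Image.new("RGB", ref.size, (255, 255, 255))
    else:  # "median_color": normalize against the reference's most frequent color
        constant = Image.new("RGB", ref.size, toy_most_frequent_color(ref))
    numerator = toy_block_distance(pred, ref)
    denominator = toy_block_distance(constant, ref) or 1.0  # avoid division by zero
    return 1.0 - numerator / denominator

A score near 1.0 means the prediction is much closer to the reference than a constant canvas would be; a score at or below 0.0 means it is no better than the baseline.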
helm/benchmark/model_metadata_registry.py
CHANGED
@@ -32,6 +32,7 @@ ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
 
 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
 GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMINI_PRO_VISION_V1_TAG: str = "GOOGLE_GEMINI_PRO_VISION_V1_TAG"
 GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
 
 # Models which emit garbage tokens when temperature=0.
@@ -159,7 +160,10 @@ def register_model_metadata(model_metadata: ModelMetadata) -> None:
 def get_model_metadata(model_name: str) -> ModelMetadata:
     """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
-        raise ValueError(
+        raise ValueError(
+            f"No model metadata for model name: {model_name} - "
+            "did you remember to add this model to model_metadata.yaml?"
+        )
 
     return MODEL_NAME_TO_MODEL_METADATA[model_name]
 
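The error-message change in get_model_metadata is small but useful when adding new models: a missing entry now points at model_metadata.yaml. A hypothetical call with an unregistered model name (the name below is made up) would behave roughly like this:

# Hypothetical usage; "example-lab/not-a-registered-model" is a made-up name.
from helm.benchmark.model_metadata_registry import get_model_metadata

try:
    get_model_metadata("example-lab/not-a-registered-model")
except ValueError as e:
    print(e)
    # No model metadata for model name: example-lab/not-a-registered-model -
    # did you remember to add this model to model_metadata.yaml?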
helm/benchmark/run_expander.py
CHANGED
@@ -8,12 +8,14 @@ from helm.benchmark.model_metadata_registry import (
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -322,6 +324,16 @@ class AnthropicClaude3RunExpander(RunExpander):
     name = "claude_3"
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        # Remove all stop sequences that do not contain non-whitespace characters.
+        # This prevents the Anthropic API from returning the following error:
+        # "stop_sequences: each stop sequence must contain non-whitespace"
+        stop_sequences_with_non_whitespace = [
+            stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+        ]
+        run_spec = replace(
+            run_spec,
+            adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+        )
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
             instructions = "Answer with only a single letter."
             if run_spec.adapter_spec.instructions:
@@ -335,78 +347,37 @@ class AnthropicClaude3RunExpander(RunExpander):
         return [run_spec]
 
 
-class
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
-
-    name = "openai"
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.
 
-
-
+    The argument controls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.
 
-
-
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.
 
-
-
-
-
-    These models need more explicit instructions about following the format.
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
     """
 
-
+    name = "follow_format_instructions"
 
-
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-
-
-            return [
-                replace(
-                    run_spec,
-                    name=run_spec.name,
-                    adapter_spec=replace(
-                        run_spec.adapter_spec,
-                        global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                        global_suffix="\n\n"
-                        + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                        + "\n"
-                        + run_spec.adapter_spec.output_prefix.strip(),
-                    ),
-                ),
-            ]
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
 
-
-
-
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-    name = "output_format_instructions"
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]
 
         return [
@@ -539,7 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
-        "
+        "vhelm": [0, 1, 2, 4, 8],
     }
 
 
@@ -1415,6 +1386,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
    NewlineRunExpander,
    StopRunExpander,
    FormatPromptRunExpander,
+   FollowFormatInstructionsRunExpander,
    AddToStopRunExpander,
    GlobalPrefixRunExpander,
    NumTrainTrialsRunExpander,
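Taken together, the run_expander.py hunks replace the old OpenAI-specific prompt expander with a generic FollowFormatInstructionsRunExpander gated on a model tag. The sketch below is a simplified, hypothetical model of that behavior (the ToyAdapterSpec fields, prefix/suffix strings, and tag lookup are placeholders, not HELM's real values):

# Hypothetical simplification of FollowFormatInstructionsRunExpander:
# only generation runs are supported, value="instruct" skips models that are
# not instruction-following, and the change lands in global_prefix/global_suffix.
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class ToyAdapterSpec:  # placeholder for HELM's AdapterSpec
    method: str = "generation"
    model: str = "example-lab/instruct-model"
    output_prefix: str = "Answer: "
    global_prefix: str = ""
    global_suffix: str = ""


FORMAT_PREFIX = "Respond only in the requested format."  # placeholder instruction text
FORMAT_SUFFIX = "Do not include any extra commentary."  # placeholder instruction text
INSTRUCTION_FOLLOWING_MODELS = {"example-lab/instruct-model"}  # placeholder for the tag lookup


def follow_format_instructions(spec: ToyAdapterSpec, value: str = "instruct") -> ToyAdapterSpec:
    if value not in ("all", "instruct"):
        raise ValueError("value must be 'all' or 'instruct'")
    if spec.method != "generation":
        raise Exception("only the generation adaptation method is supported")
    if value == "instruct" and spec.model not in INSTRUCTION_FOLLOWING_MODELS:
        return spec  # leave non-instruction-following models untouched
    return replace(
        spec,
        global_prefix=FORMAT_PREFIX + "\n\n",
        global_suffix="\n\n" + FORMAT_SUFFIX + "\n" + spec.output_prefix.strip(),
    )


print(follow_format_instructions(ToyAdapterSpec()).global_prefix)

The value="instruct" path relies on INSTRUCTION_FOLLOWING_MODEL_TAG from model_metadata_registry, which is why that import was added at the top of run_expander.py.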
helm/benchmark/run_spec_factory.py
CHANGED
@@ -4,7 +4,6 @@ from typing import List
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
     ADAPT_MULTIPLE_CHOICE_JOINT,
-    ADAPT_GENERATION_MULTIMODAL,
 )
 from helm.benchmark.model_deployment_registry import (
     ModelDeployment,
@@ -14,22 +13,24 @@ from helm.benchmark.model_deployment_registry import (
 from helm.benchmark.model_metadata_registry import (
     ANTHROPIC_CLAUDE_1_MODEL_TAG,
     ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    ANTHROPIC_CLAUDE_3_MODEL_TAG,
     BUGGY_TEMP_0_TAG,
     CHATML_MODEL_TAG,
-
+    GOOGLE_GEMINI_PRO_VISION_V1_TAG,
     IDEFICS_INSTRUCT_MODEL_TAG,
-    IDEFICS_MODEL_TAG,
     LLAVA_MODEL_TAG,
     OPEN_FLAMINGO_MODEL_TAG,
-    VISION_LANGUAGE_MODEL_TAG,
     NLG_PREFIX_TAG,
     NO_NEWLINES_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
     ModelMetadata,
     get_model_metadata,
 )
 from helm.benchmark.run_expander import (
     RUN_EXPANDERS,
     AnthropicClaude2RunExpander,
+    AnthropicClaude3RunExpander,
     ChatMLRunExpander,
     GlobalPrefixRunExpander,
     IDEFICSInstructRunExpander,
@@ -125,20 +126,20 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
         run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
 
-    #
+    # Anthropic Claude 3
+    if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+        run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+    # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
     if (
         VISION_LANGUAGE_MODEL_TAG in model.tags
-        and
+        and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
         and run_spec.adapter_spec.max_tokens == 1
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
-        # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
-        if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
-
         if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
             run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
 
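The run_spec_factory.py changes follow the same pattern throughout: inspect the model's metadata tags and conditionally rewrite the RunSpec with the matching expander. A hypothetical distillation of that dispatch (dict-based stand-ins for RunSpec and the expanders; the tag strings mirror the constants in the diff):

# Hypothetical distillation of the tag-driven dispatch in construct_run_specs.
from typing import Set


def clamp_stop_sequences(run_spec: dict) -> dict:
    # Mirrors AnthropicClaude3RunExpander: drop whitespace-only stop sequences.
    stops = run_spec.get("stop_sequences", [])
    return {**run_spec, "stop_sequences": [s for s in stops if s.strip()]}


def increase_max_tokens(run_spec: dict, value: int) -> dict:
    # Mirrors IncreaseMaxTokensRunExpander(value=...).
    return {**run_spec, "max_tokens": run_spec.get("max_tokens", 0) + value}


def alter_run_spec(run_spec: dict, model_tags: Set[str]) -> dict:
    if "ANTHROPIC_CLAUDE_3_MODEL_TAG" in model_tags:
        run_spec = clamp_stop_sequences(run_spec)
    # Gemini Pro Vision v1 returns an empty completion or errors when max_tokens is 1.
    if (
        "VISION_LANGUAGE_MODEL_TAG" in model_tags
        and "GOOGLE_GEMINI_PRO_VISION_V1_TAG" in model_tags
        and run_spec.get("max_tokens") == 1
    ):
        run_spec = increase_max_tokens(run_spec, 1)
    return run_spec


print(alter_run_spec({"stop_sequences": ["\n", "###"], "max_tokens": 1},
                     {"ANTHROPIC_CLAUDE_3_MODEL_TAG"}))

Note also that the blanket max_tokens bump for IDEFICS open-ended generation was removed, along with the now-unused ADAPT_GENERATION_MULTIMODAL import.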