crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
@@ -28,14 +28,14 @@ from helm.benchmark.metrics.vision_language.image_utils import (
     pixel_similarity,
     sift_similarity,
 )
-from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color
+from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive, get_most_frequent_color, to_gray
 
 try:
     from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
     from PIL import Image
     import imagehash
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
@@ -78,9 +78,8 @@ class AnnotatedImageMetrics(Metric):
 
     # Metric names
     COMPILE_METRIC: str = "compilation_success"
-
-
-    BLOCK_EARTH_MOVER_SIMILARITY: str = "block_emd_similarity"
+    EARTH_MOVER_SIMILARITY = "earth_mover_similarity"
+    BLOCK_EMD: str = "block_emd"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
     LPIPS_SIMILARITY: str = "lpips_similarity"
@@ -108,12 +107,10 @@ class AnnotatedImageMetrics(Metric):
         metrics: List[AnnotatedMetric] = [
             AnnotatedMetric(self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
             AnnotatedMetric(self.SIFT_SIMILARITY, sift_similarity, "image_np"),
-            # Raw block
-            AnnotatedMetric(
-
-
-            # Normalized block EMD against median
-            AnnotatedMetric(self.BLOCK_EARTH_MOVER_SIMILARITY_NORM2, self.compute_block_emd_median, "image_PIL"),
+            AnnotatedMetric(self.BLOCK_EMD, self.compute_block_emd_raw, "image_PIL"),  # Raw block-EMD
+            AnnotatedMetric(
+                self.EARTH_MOVER_SIMILARITY, self.ems, "image_PIL"
+            ),  # Normalized block-EMD against black/white
             AnnotatedMetric(self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
             AnnotatedMetric(self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
             AnnotatedMetric(self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
@@ -391,9 +388,15 @@ class AnnotatedImageMetrics(Metric):
         features1 = self._get_inception_features(img1_tensor)
         features2 = self._get_inception_features(img2_tensor)
 
-
-
-
+        # TODO: Justify the value of the constant here or remove this code to only keep the cosine similarity.
+        # fid_score = self._calculate_fid(features1, features2)
+        # normalize_fid: float = np.exp(-fid_score * self.NORMALIZE_FID_FACTOR)
+        # return normalize_fid
+
+        # Use the cosine similarity between the features as a proxy for FID
+        # Return a score between 0 and 1, where 1 is the most similar
+        score = 0.5 * (1 + np.dot(features1[0], features2[0]) / (np.linalg.norm(features1) * np.linalg.norm(features2)))
+        return score
 
     def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
         """Compute the Structural Similarity Index (SSIM) between the generated and reference images."""
@@ -414,58 +417,7 @@ class AnnotatedImageMetrics(Metric):
         result = _edit_similarity(completion_tokens, truncated_reference_tokens)
         return result
 
-    def
-        self,
-        pred_image: Image.Image,
-        ref_image: Image.Image,
-        threshold_most_frequent_color: float = 0.5,
-        patch_size: Tuple[int, int] = (8, 8),
-        max_num_patches: int = 100,
-        weight_most_frequent_color: float = 0.001,
-        use_tqdm: bool = False,
-    ):
-        """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering movement/transformatio
-        of blocks of pixels. The score is normalized against EMD against white images
-        """
-
-        def compute_numerator():
-            return self.compute_block_emd_raw_wrapper(
-                pred_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-
-        def compute_denominator():
-            constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
-            value = compute_emd_recursive(
-                constant_image,
-                ref_image,
-                threshold_most_frequent_color,
-                patch_size,
-                max_num_patches,
-                weight_most_frequent_color,
-                use_tqdm,
-            )
-            return {"value": value}
-
-        hash_dict = {
-            "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
-        }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY}", **hash_dict}
-        cache_key_denominator = {"metric_name": f"intermediate_{self.BLOCK_EARTH_MOVER_SIMILARITY_NORM1}", **hash_dict}
-
-        assert self._cache is not None
-        emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
-        emd_base, _ = self._cache.get(cache_key_denominator, compute_denominator)
-
-        return 1.0 - emd_raw["value"] / emd_base["value"]
-
-    def compute_block_emd_median(
+    def ems(
         self,
         pred_image: Image.Image,
         ref_image: Image.Image,
@@ -493,9 +445,13 @@ class AnnotatedImageMetrics(Metric):
         def compute_denominator():
             ref_img_np = np.array(ref_image)
             (rgb_most_frequent_color, _) = get_most_frequent_color(ref_img_np)
+            grayscale_most_frequent_color = to_gray(rgb_most_frequent_color)[0]
 
             # Most frequent color as base
-
+            if grayscale_most_frequent_color < 127:
+                constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # Make it white
+            else:
+                constant_image = Image.new("RGB", ref_image.size, (0, 0, 0))  # Make it black
             value = compute_emd_recursive(
                 constant_image,
                 ref_image,
@@ -509,9 +465,10 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key_numerator = {"metric_name": f"intermediate_{self.
-        cache_key_denominator = {"metric_name":
+        cache_key_numerator = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
+        cache_key_denominator = {"metric_name": "intermediate_ems_extreme_denominator", **hash_dict}
 
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key_numerator, compute_numerator)
@@ -542,8 +499,9 @@ class AnnotatedImageMetrics(Metric):
 
         hash_dict = {
             "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
+            "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(pred_image, hash_size=self.HASH_LENGTH)),
         }
-        cache_key = {"metric_name": f"intermediate_{self.
+        cache_key = {"metric_name": f"intermediate_{self.BLOCK_EMD}", **hash_dict}
         assert self._cache is not None
         emd_raw, _ = self._cache.get(cache_key, compute)
 
@@ -560,8 +518,8 @@ class AnnotatedImageMetrics(Metric):
         use_tqdm: bool = False,
     ):
         """Computes the block Earth Moving Distance (EMD). This attempts to
-        speed up EMD for images with huge areas by considering
-        of blocks of pixels.
+        speed up EMD for images with huge areas by considering
+        movement/transformation of blocks of pixels.
         """
         emd_value = compute_emd_recursive(
             pred_image,
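For orientation, the hunks above (which appear to come from helm/benchmark/metrics/vision_language/image_metrics.py, judging by the imports and class name) replace the old white-image normalization of the block EMD with an `ems` ("earth mover similarity") score that normalizes against a constant black or white image, chosen as the opposite extreme of the reference's most frequent gray level. A minimal standalone sketch of that normalization follows; the helper names and the `block_emd` callback are hypothetical stand-ins, while the real logic lives in `AnnotatedImageMetrics.ems` and `compute_emd_recursive`.

from typing import Callable

from PIL import Image


def earth_mover_similarity(
    pred_image: Image.Image,
    ref_image: Image.Image,
    block_emd: Callable[[Image.Image, Image.Image], float],
    grayscale_most_frequent_color: int,
) -> float:
    """Sketch: normalize the raw block EMD by the EMD between the reference and a
    constant image at the opposite gray extreme (white if the reference is mostly
    dark, black otherwise), then map the ratio to a similarity score."""
    if grayscale_most_frequent_color < 127:
        extreme = Image.new("RGB", ref_image.size, (255, 255, 255))  # mostly dark reference -> white baseline
    else:
        extreme = Image.new("RGB", ref_image.size, (0, 0, 0))  # mostly light reference -> black baseline
    return 1.0 - block_emd(pred_image, ref_image) / block_emd(extreme, ref_image)

The raw value is also exposed separately under the new `block_emd` metric name, while the normalized score is reported as `earth_mover_similarity`.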
@@ -6,7 +6,7 @@ try:
     import cv2
     from PIL.Image import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["
+    handle_module_not_found_error(e, suggestions=["image2struct"])
 
 
 def preprocess_image(image: Image) -> np.ndarray:
@@ -22,9 +22,6 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
-# Mistral instruction-following format
-MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
-
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -69,6 +66,9 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# Deprecated models that are no longer available.
+# These are usually closed API models that have been permanently removed
+DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
@@ -1,6 +1,9 @@
+import ast
+import dataclasses
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict
 import dacite
+from inspect import cleandoc
 import mako.template
 import yaml
 import importlib_resources as resources
@@ -17,6 +20,11 @@ SCHEMA_YAML_PACKAGE: str = "helm.benchmark.static"
 SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"
 
 
+_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
+_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
+_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"
+
+
 @dataclass(frozen=True)
 class Field:
     """
@@ -198,9 +206,6 @@ class RunGroup(Field):
 class Schema:
     """Specifies information about what to display on the frontend."""
 
-    # Adapter fields (e.g., temperature)
-    adapter: List[Field]
-
     # Information about each field
     metrics: List[Field]
 
@@ -213,6 +218,11 @@ class Schema:
     # Group the scenarios
     run_groups: List[RunGroup]
 
+    # Adapter fields (e.g., temperature)
+    # Automatically populated from the docstrings in the AdapterSpec class definition.
+    # Should not be specified in the user's YAML file.
+    adapter: Optional[List[Field]] = None
+
     def __post_init__(self):
         self.name_to_metric = {metric.name: metric for metric in self.metrics}
         self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
@@ -220,6 +230,43 @@ class Schema:
         self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
 
 
+def get_adapter_fields() -> List[Field]:
+    """Generate the adapter fields from the docstrings in the AdapterSpec class definition."""
+    # Unfortunately there is no standard library support for getting docstrings of class fields,
+    # so we have to do the parsing outselves. Fortunately, the parsing is quite straightforward.
+    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
+    with open(adapter_spec_path, "r") as f:
+        contents = f.read()
+    module_node = ast.parse(contents)
+    adapter_spec_node = [
+        node
+        for node in ast.iter_child_nodes(module_node)
+        if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
+    ][0]
+    metadata_fields: List[Field] = []
+    field_name: str = ""
+    for node in ast.iter_child_nodes(adapter_spec_node):
+        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+            # This node is a field definition.
+            # Save the name of the field for later.
+            field_name = node.target.id
+        else:
+            # If this is a docstring that immediately follows a field definition,
+            # output an adapter field with the name set to the field definition and
+            # the description set to the docstring.
+            if (
+                field_name
+                and isinstance(node, ast.Expr)
+                and isinstance(node.value, ast.Constant)
+                and isinstance(node.value.value, str)
+            ):
+                description = cleandoc(node.value.value).replace("\n", " ")
+                metadata_fields.append(Field(name=field_name, description=description))
+            field_name = ""
+
+    return metadata_fields
+
+
 def get_default_schema_path() -> str:
     return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
 
@@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
     hlog(f"Reading schema file {schema_path}...")
     with open(schema_path, "r") as f:
         raw = yaml.safe_load(f)
-
+    schema = dacite.from_dict(Schema, raw)
+    if schema.adapter:
+        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+    return dataclasses.replace(schema, adapter=get_adapter_fields())
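The new `get_adapter_fields` helper above walks the AST of adapter_spec.py and pairs each annotated class field with the bare string literal that immediately follows it. A self-contained sketch of that convention is shown below; the toy class and the `temperature` docstring text are made up for illustration, while the `method` description matches the string asserted in the new test_schema.py.

import ast
from inspect import cleandoc

# Toy source following the field-docstring convention that get_adapter_fields parses:
# an annotated assignment followed immediately by a bare string literal.
source = '''
class AdapterSpec:
    method: str = ""
    """The high-level strategy for converting instances into a prompt for the language model."""

    temperature: float = 1.0
    """Temperature parameter used in generation."""
'''

class_node = ast.parse(source).body[0]
fields = {}
field_name = ""
for node in ast.iter_child_nodes(class_node):
    if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
        field_name = node.target.id  # remember the field; its docstring may follow
    elif field_name and isinstance(node, ast.Expr) and isinstance(node.value, ast.Constant):
        fields[field_name] = cleandoc(node.value.value).replace("\n", " ")
        field_name = ""

print(fields["method"])  # same description that test_get_adapter_fields checks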
@@ -16,6 +16,7 @@ class TestRunEntry:
 
     @pytest.mark.parametrize("fname", list_fnames())
     def test_read_all_specs(self, fname: str):
+        pytest.skip("Skipping slow tests")
         run_entries = read_run_entries([fname])
         for entry in run_entries.entries:
             construct_run_specs(parse_object_spec(entry.description))
@@ -0,0 +1,11 @@
+from helm.benchmark.presentation.schema import get_adapter_fields
+
+
+def test_get_adapter_fields() -> None:
+    adapter_fields = get_adapter_fields()
+    assert adapter_fields
+    assert adapter_fields[0].name == "method"
+    assert (
+        adapter_fields[0].description
+        == "The high-level strategy for converting instances into a prompt for the language model."
+    )
helm/benchmark/run.py
CHANGED
@@ -1,9 +1,11 @@
 import argparse
 from dataclasses import replace
 import os
+import re
 from typing import List, Optional
 
 
+from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
@@ -264,6 +266,13 @@ def main():
         default=None,
         help="Full class name of the Runner class to use. If unset, uses the default Runner.",
     )
+    parser.add_argument(
+        "--openvino",
+        action="store_true",
+        default=False,
+        help="Experimental: Apply openvino optimization to Hugging Face AutoModelForCausalLM models "
+        "specified with the --enable-huggingface-models and --enable-local-huggingface-models flags.",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
@@ -275,12 +284,19 @@ def main():
         from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
 
         for huggingface_model_name in args.enable_huggingface_models:
-
+            if args.openvino:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name, args.openvino)
+            else:
+                register_huggingface_hub_model_from_flag_value(huggingface_model_name)
 
     if args.enable_local_huggingface_models:
         from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
 
         for huggingface_model_path in args.enable_local_huggingface_models:
-
+            if args.openvino:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path, args.openvino)
+            else:
+                register_huggingface_local_model_from_flag_value(huggingface_model_path)
 
     run_entries: List[RunEntry] = []
     if args.conf_paths:
@@ -300,6 +316,19 @@ def main():
     ensure_directory_exists(args.output_path)
     set_benchmark_output_path(args.output_path)
 
+    # Validate the --models-to-run flag
+    if args.models_to_run:
+        all_models = set(model_metadata_registry.get_all_models())
+        for model_to_run in args.models_to_run:
+            if model_to_run not in all_models:
+                raise Exception(f"Unknown model '{model_to_run}' passed to --models-to-run")
+    else:
+        model_expander_pattern = re.compile(
+            r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"  # noqa: E501
+        )
+        if any(model_expander_pattern.search(run_entry.description) for run_entry in run_entries):
+            raise Exception("--models-to-run must be set if the `models=` run expander expands to multiple models")
+
     run_specs = run_entries_to_run_specs(
         run_entries=run_entries,
         max_eval_instances=args.max_eval_instances,
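The new validation block above refuses to proceed when a run entry uses one of the broad `model=` run-expander aliases without an explicit --models-to-run allow-list. A small sketch of what the regex does and does not match; the pattern is copied from the hunk, while the run entry strings are made-up examples.

import re

model_expander_pattern = re.compile(
    r"\bmodel=(?:all|text_code|text|code|instruction_following|full_functionality_text|limited_functionality_text)\b"
)

# Made-up run entry descriptions, for illustration only.
assert model_expander_pattern.search("mmlu:subject=anatomy,model=text")  # alias expanding to many models
assert not model_expander_pattern.search("mmlu:subject=anatomy,model=openai/gpt-4-0613")  # one concrete model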
helm/benchmark/run_expander.py
CHANGED
@@ -10,6 +10,7 @@ from helm.benchmark.model_metadata_registry import (
     get_all_text_models,
     get_model_metadata,
     get_model_names_with_tag,
+    DEPRECATED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
@@ -194,6 +195,15 @@ class StopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "none":
+            return [
+                replace(
+                    run_spec,
+                    name=f"{run_spec.name},{self.name}={self.value}",
+                    adapter_spec=replace(run_spec.adapter_spec, stop_sequences=[]),
+                ),
+            ]
+
         if self.value == "hash":
             stop = "###"
         elif self.value == "semicolon":
@@ -334,16 +344,6 @@ class AnthropicClaude3RunExpander(RunExpander):
                 run_spec,
                 adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
             )
-        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
-            instructions = "Answer with only a single letter."
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
-            return [
-                replace(
-                    run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
-                ),
-            ]
         return [run_spec]
 
 
@@ -601,6 +601,12 @@ class ModelRunExpander(ReplaceValueRunExpander):
                 values_dict["ablation"] = models
             else:
                 values_dict[family_name] = models
+
+        # For each of the keys above, filter out deprecated models.
+        deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+        for family_name in values_dict.keys():
+            values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
+
         return values_dict
 
 
@@ -1035,6 +1041,7 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
     "chinese": {"chinese": [translate(language_code="zh-CN")]},
     "hindi": {"hindi": [translate(language_code="hi")]},
     "spanish": {"spanish": [translate(language_code="es")]},
+    "swahili": {"swahili": [translate(language_code="sw")]},
     # Styles
     "art": {
         "art": [
@@ -1380,6 +1387,101 @@ class ChatMLRunExpander(RunExpander):
     ]
 
 
+class OutputFormatInstructions(RunExpander):
+    """Add extra instructions to about output formatting to HELM Lite scenarios.
+
+    Many instruction-following models and chat models are tuned to expect conversational prompts
+    and respond in a conversational way. These models occasionally produce outputs that are not
+    in the expected format. This run expander instructs these models to provide the output in
+    the format expected by the scenario.
+
+    The argument should be the name of the scenario."""
+
+    name = "output_format_instructions"
+
+    _SUFFIX_SUFFIX = "_suffix"
+
+    def __init__(self, scenario: str):
+        if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
+            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            self.suffix = True
+        else:
+            self.scenario = scenario
+            self.suffix = False
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            if self.scenario == "mmlu_only_last_question":
+                instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "mmlu":
+                instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa":
+                instructions = "Answer with only a single letter."
+            else:
+                instructions = "Answer with only a single letter."
+        elif run_spec.adapter_spec.method == ADAPT_GENERATION:
+            output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
+            if self.scenario == "narrative_qa":
+                instructions = (
+                    "Answer with one word, a few-word phrase, or a short sentence. "
+                    + "Avoid extra, unnecessary information in the answer."
+                )
+            elif self.scenario == "natural_qa":
+                instructions = "Answer with a short answer or a boolean 'yes' or 'no' answer."
+            elif self.scenario == "legalbench":
+                if output_noun != "Answer":
+                    instructions = f"Answer with the {output_noun.lower()}."
+                else:
+                    instructions = "Answer yes or no."
+            elif self.scenario == "legalbench_abercrombie":
+                instructions = "Answer with only 'generic', 'descriptive', 'suggestive', 'arbitrary' or 'fanciful'."
+            elif self.scenario == "legalbench_function_of_decision_section":
+                instructions = "Answer with only 'Facts', 'Procedural History', 'Issue', 'Rule', 'Analysis', 'Conclusion' or 'Decree'."  # noqa: E501
+            elif self.scenario == "legalbench_yes_or_no":
+                instructions = "Answer with only 'Yes' or 'No'."
+            elif self.scenario == "wmt_14":
+                instructions = "Answer with the English translation."
+            elif self.scenario == "wmt_14_only_last_sentence":
+                instructions = "Answer with only the English translation for the last sentence."
+            elif self.scenario == "math":
+                instructions = "Wrap the final answer with the \\boxed{} command."
+            elif self.scenario == "numeric_nlg":
+                instructions = "Answer with only description of the last table as a single paragraph on a single line."
+            elif self.scenario == "tab_fact":
+                instructions = (
+                    "Answer with only the classification of the last statement, either 'refuted' or 'entailed'."
+                )
+            elif self.scenario == "wikitq":
+                instructions = (
+                    "Answer only the last question with a short answer. "
+                    "Avoid extra, unnecessary information in the answer."
+                )
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
+
+        if self.suffix:
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(
+                        run_spec.adapter_spec,
+                        global_suffix=f"{run_spec.adapter_spec.global_suffix}\n\n{instructions}",
+                    ),
+                ),
+            ]
+
+        if run_spec.adapter_spec.instructions:
+            instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+        else:
+            instructions = f"{instructions}\n"
+        return [
+            replace(
+                run_spec,
+                adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+            ),
+        ]
+
+
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     InstructionsRunExpander,
     PromptRunExpander,
@@ -1402,6 +1504,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
     EvalSplitRunExpander,
+    OutputFormatInstructions,
 ]
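The new `OutputFormatInstructions` expander prepends scenario-specific formatting instructions to the adapter spec, or appends them to the global suffix when the argument ends in `_suffix`. A schematic sketch of that prefix/suffix split is shown below using plain strings rather than the real `RunSpec`/`AdapterSpec` dataclasses; the function name and signature are invented for illustration.

from typing import Tuple


def apply_output_format_instructions(
    instructions: str, existing_instructions: str, global_suffix: str, as_suffix: bool
) -> Tuple[str, str]:
    """Sketch of the expander's two modes: as_suffix appends to the global suffix,
    otherwise the new instructions are prepended to the existing adapter instructions."""
    if as_suffix:
        return existing_instructions, f"{global_suffix}\n\n{instructions}"
    if existing_instructions:
        return f"{instructions}\n\n{existing_instructions}", global_suffix
    return f"{instructions}\n", global_suffix


# e.g. for MMLU under the multiple-choice-joint adaptation:
new_instructions, new_suffix = apply_output_format_instructions(
    "Answer with only a single letter.", "", "", as_suffix=False
)

Presumably the expander is selected from a run entry the same way as other value-taking expanders (for example an `output_format_instructions=mmlu` clause in the run description), though that invocation syntax is inferred rather than shown in this diff.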
@@ -156,6 +156,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
            increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
            run_spec = singleton(increase_temperature_expander.expand(run_spec))
 
+        # MedLM-Large
+        if run_spec.adapter_spec.model == "google/medlm-large":
+            run_spec = singleton(StopRunExpander("none").expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("air_bench_2024")
+def get_air_bench_2024_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.air_bench_scenario.AIRBench2024Scenario")
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.air_bench_annotator.AIRBench2024Annotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024ScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.air_bench_metrics.AIRBench2024BasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="air_bench_2024",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["air_bench_2024"],
+    )