crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two package versions.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py
CHANGED

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
         hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")

     # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
     else:
         chosen_deployment = available_deployments[0]
         if warn_arg_deprecated:
-            hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+            hwarn(f"All model deployments for model {model_name} are deprecated.")
     if warn_arg_deprecated:
         hlog(
             f"Choosing {chosen_deployment.name} (the first one) as "
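The recurring change across this release's Python modules replaces `hlog(f"WARNING: ...")` calls with a new `hwarn` helper imported from `helm.common.hierarchical_logger`. The helper's implementation is not part of this diff; the sketch below is only an assumption about its behavior, namely that it adds the warning prefix and delegates to `hlog`:

```python
# Hypothetical sketch only -- the real hwarn lives in
# helm/common/hierarchical_logger.py and is not shown in this diff.
from helm.common.hierarchical_logger import hlog


def hwarn(message: str) -> None:
    """Log a warning through the hierarchical logger with a uniform prefix."""
    hlog(f"WARNING: {message}")
```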
helm/benchmark/presentation/contamination.py
CHANGED

@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml

-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema

@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")


 def read_contamination():
helm/benchmark/presentation/create_plots.py
CHANGED

@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr

 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA

@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()


-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()


+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
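Splitting the old `main()` into `create_plots(args)` plus a thin CLI wrapper means plot generation can also be driven programmatically. A hedged sketch, assuming only the three attributes that mirror the CLI flags defined in `main()` above are needed:

```python
# Illustrative use of the refactored entry point; the Namespace attributes
# mirror the --output-path, --suite and --plot-format flags shown above.
from argparse import Namespace

from helm.benchmark.presentation.create_plots import create_plots

args = Namespace(output_path="benchmark_output", suite="my_suite", plot_format="png")
create_plots(args)
```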
helm/benchmark/presentation/run_display.py
CHANGED

@@ -59,6 +59,9 @@ class DisplayPrediction:

     annotations: Optional[Dict[str, Any]]

+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+

 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )

+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]

+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
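The new `thinking_text` field is filled from the first completion's `thinking` block when one is present. A small illustrative sketch of that None-guarded extraction, where `completion` stands in for `request_state.result.completions[0]` (the helper name is hypothetical and not part of this diff):

```python
# Illustrative only: mirrors the guarded access used in write_run_display_json.
from typing import Optional


def extract_thinking_text(completion) -> Optional[str]:
    """Return the completion's thinking text, or None if the model produced none."""
    thinking = getattr(completion, "thinking", None)
    return thinking.text if thinking else None
```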
helm/benchmark/presentation/schema.py
CHANGED

@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn


 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
helm/benchmark/presentation/summarize.py
CHANGED

@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -406,8 +406,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-                    hlog(
-                        f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -440,14 +440,14 @@ class Summarizer:
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +544,8 @@ class Summarizer:

         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )

@@ -738,8 +738,8 @@ class Summarizer:
         if stat is None:
             # Print out near misses to provide a more informative warning
             near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-            hlog(
-                f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+            hwarn(
+                f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                 f"{len(near_misses)} near misses matching just the name"
             )
             if len(near_misses) > 0:
@@ -810,7 +810,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])

         header: List[HeaderCell] = []
@@ -831,7 +831,7 @@ class Summarizer:
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -959,8 +959,8 @@ class Summarizer:
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@


 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1253,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1285,47 +1341,8 @@ def main():
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
     args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
-    )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    setup_default_logging()
+    summarize(args)


 if __name__ == "__main__":
helm/benchmark/reeval_run.py
CHANGED

@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args

@@ -191,9 +191,8 @@ def main():
     )

     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )

     hlog("Done.")
helm/benchmark/reeval_runner.py
CHANGED

@@ -12,7 +12,7 @@ from datasets import load_dataset

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return

         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")

         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")
helm/benchmark/run.py
CHANGED

@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):


 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):

+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)

@@ -358,13 +291,85 @@ def main():
     )

     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )

     hlog("Done.")


+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging()
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()
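helm-run adopts the same split seen in summarize.py and create_plots.py above: main() now only builds the argument parser, calls setup_default_logging(), and hands the parsed namespace to a plain function (helm_run(args)). A minimal, purely illustrative sketch of this parse-then-dispatch pattern; the names below are placeholders, not the actual HELM entry points:

```python
# Illustrative parse-then-dispatch pattern; names here are placeholders.
import argparse


def do_work(args: argparse.Namespace) -> None:
    # The real entry points pass `args` to helm_run / summarize / create_plots.
    print(f"suite={args.suite!r} threads={args.num_threads}")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--suite", required=True)
    parser.add_argument("-n", "--num-threads", type=int, default=8)
    args = parser.parse_args()
    # setup_default_logging() is called at this point in the real commands.
    do_work(args)


if __name__ == "__main__":
    main()
```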