crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run.py
CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
    register_configs_from_directory(args.local_path)
 
@@ -358,13 +291,91 @@ def main():
     )
 
     if args.run_specs:
-
-            "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )
 
    hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()
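The change above splits the old monolithic main() into helm_run(args), which does the actual work, and a thin main() that only parses arguments and calls the new setup_default_logging() before handing off; the --run-specs deprecation notice now goes through the new hwarn helper. A minimal usage sketch, assuming crfm-helm 0.5.7 is installed; passing None mirrors the default taken when --log-config is not supplied, and the log messages themselves are illustrative:

    # Hypothetical sketch: configure logging the way the new main() does, then log through it.
    from helm.common.hierarchical_logger import setup_default_logging, hlog, hwarn

    setup_default_logging(None)  # or a path to a YAML logging config, per the new --log-config flag
    hlog("messages now go through the configured logger")
    hwarn("deprecation and other warnings use the new hwarn helper")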
helm/benchmark/run_expander.py
CHANGED
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
 
 
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_no_period":
+                instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
             else:
@@ -1521,6 +1527,11 @@ class OutputFormatInstructions(RunExpander):
                 )
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
 
         if self.no_prefix:
             if instructions:
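The "melt" entry above gives the new MELT run specs (see melt_run_specs.py and schema_melt.yaml elsewhere in this release) their own few-shot schedule. A hypothetical sketch of how such a key is consumed, modeled on the expander call pattern visible in this diff (IncreaseMaxTokensRunExpander(value=1).expand(run_spec)); the constructor keyword and the pre-existing run_spec variable are assumptions:

    # Hypothetical sketch: a ReplaceValueRunExpander key fans one RunSpec out into several,
    # here one copy per few-shot setting in [0, 1, 5].
    from helm.benchmark.run_expander import MaxTrainInstancesRunExpander

    expanded_run_specs = MaxTrainInstancesRunExpander(value="melt").expand(run_spec)  # run_spec assumed to exist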
helm/benchmark/run_spec_factory.py
CHANGED
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
     ):
         run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
-    if model.name.startswith("openai/o"):
-        # From https://platform.openai.com/docs/guides/reasoning,
-        # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
-        # experimenting with these models. As you become familiar with the number of reasoning tokens your
-        # prompts require, you can adjust this buffer accordingly."
-        run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+    # TODO: find a better solution for this
+    # if model.name.startswith("openai/o"):
+    #     # From https://platform.openai.com/docs/guides/reasoning,
+    #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+    #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+    #     # prompts require, you can adjust this buffer accordingly."
+    #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
 
     # IDEFICS special handling
     if IDEFICS_MODEL_TAG in model.tags:
helm/benchmark/run_specs/arabic_run_specs.py
ADDED
@@ -0,0 +1,73 @@
+"""Run specs for Arabic leaderboard
+
+EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("arabic_mmlu")
+def get_arabic_mmlu_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario")
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="arabic_mmlu",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_mmlu"],
+    )
+
+
+@run_spec_function("alghafa")
+def get_alghafa_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.alghafa_scenario.AlGhafaScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"alghafa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["alghafa", f"alghafa_{subset}"],
+    )
+
+
+@run_spec_function("aratrust")
+def get_aratrust_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="aratrust",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["aratrust"],
+    )
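Because these functions are registered with @run_spec_function, they can also be imported and called directly, which is a quick way to check what a run entry resolves to. A small sketch, assuming crfm-helm 0.5.7 is installed; the alghafa subset value is a placeholder, not a real subset name:

    # Hypothetical sketch: inspect the RunSpecs produced by the new Arabic entries.
    from helm.benchmark.run_specs.arabic_run_specs import get_arabic_mmlu_spec, get_alghafa_spec

    spec = get_arabic_mmlu_spec()
    print(spec.name)    # "arabic_mmlu"
    print(spec.groups)  # ["arabic_mmlu"]

    subset_spec = get_alghafa_spec(subset="some_subset")
    print(subset_spec.name)  # "alghafa:subset=some_subset"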
helm/benchmark/run_specs/audio_run_specs.py
CHANGED
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
     ]
 
 
+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
 
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:
 
 
 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(
-
-
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)
 
     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
 
 
 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSpec:
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[
+        groups=[group_name],
     )
 
 
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
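A consequence of the vocal_sound change above is that the run spec is now built once per sound class and the run name carries that class. A small sketch, assuming crfm-helm 0.5.7 is installed; "Cough" is taken from the options listed in the prompt above, but the set of values the scenario actually accepts is an assumption:

    # Hypothetical sketch: vocal_sound is now parameterized by the sound class.
    from helm.benchmark.run_specs.audio_run_specs import get_vocal_sound_run_spec

    spec = get_vocal_sound_run_spec(sound="Cough")
    print(spec.name)  # "vocal_sound:sound=Cough"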
helm/benchmark/run_specs/bluex_run_specs.py
ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEX_Scenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+Exemplo de Pergunta com a resposta:
+Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+(A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+(B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+(C) Narrador-personagem que vivencia e relata os eventos da história.
+(D) Narrador observador que apenas registra as ações visíveis.
+(E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+Resposta correta: C
+
+A partir disso, responda:
+""",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
helm/benchmark/run_specs/classic_run_specs.py
CHANGED
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(