crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py
CHANGED

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment


@@ -159,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"

     The process to find a model deployment name is as follows:
-    1. If there is
-    2. If there
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.

     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.

     If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
@@ -177,23 +174,14 @@
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """

-    # If there is
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hlog(f"WARNING: Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
     ]
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
             hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")

         # Additionally, if there is a non-deprecated deployment, use it.
@@ -201,19 +189,21 @@
             deployment for deployment in available_deployments if not deployment.deprecated
         ]
         if len(non_deprecated_deployments) > 0:
-            chosen_deployment = non_deprecated_deployments[0]
+            chosen_deployment = non_deprecated_deployments[-1]
         # There are no non-deprecated deployments, so there are two options:
         # 1. If we can return an empty string, return it. (no model deployment is available)
-        # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
+        # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
         elif ignore_deprecated:
             return None
-        else:
-            chosen_deployment = available_deployments[0]
+        elif len(available_deployments) > 0:
+            chosen_deployment = available_deployments[-1]
             if warn_arg_deprecated:
-                hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
+        else:
+            return None
         if warn_arg_deprecated:
             hlog(
-                f"Choosing {chosen_deployment.name} (the first one) as "
+                f"Choosing {chosen_deployment.name} (the last one) as "
                 f"the default model deployment for model {model_name}"
             )
             hlog("If you want to use a different model deployment, please specify it explicitly.")
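Note on the recurring pattern: throughout this release, ad-hoc hlog(f"WARNING: ...") calls are replaced by a dedicated hwarn helper imported from helm.common.hierarchical_logger (which changes by +104 -12 in this release but is not shown here), and get_default_model_deployment_for_model now picks the last matching deployment instead of the first. A minimal sketch of what such a warning helper could look like, assuming it simply prefixes the message and delegates to hlog; the real implementation may differ:

    from helm.common.hierarchical_logger import hlog

    def hwarn(message: str) -> None:
        # Assumed behavior: tag the message and route it through the hierarchical logger.
        hlog(f"WARNING: {message}")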
helm/benchmark/presentation/contamination.py
CHANGED

@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml

-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema

@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")


 def read_contamination():
helm/benchmark/presentation/create_plots.py
CHANGED

@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -11,7 +14,7 @@ import numpy as np
 from scipy.stats import pearsonr

 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA

@@ -600,17 +603,7 @@ class Plotter:
         self.create_constrast_set_plots()


-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +614,42 @@ def main():
     plotter.create_all_plots()


+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
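create_plots.py now splits the console entry point into create_plots(args) (the work) and main() (argument parsing plus logging setup via the new --log-config flag and setup_default_logging). A minimal sketch of reusing the same wiring in another command-line tool, assuming setup_default_logging takes an optional path to a YAML logging config as shown in the diff above:

    import argparse

    from helm.common.hierarchical_logger import hlog, setup_default_logging


    def do_work(args: argparse.Namespace) -> None:
        # Logging is already configured by the time this runs.
        hlog(f"Processing suite {args.suite}")


    def main() -> None:
        parser = argparse.ArgumentParser()
        parser.add_argument("--suite", type=str, required=True)
        parser.add_argument("--log-config", type=str, default=None, help="PATH to a YAML file to customize logging")
        args = parser.parse_args()
        setup_default_logging(args.log_config)  # assumed to fall back to defaults when None
        do_work(args)


    if __name__ == "__main__":
        main()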
helm/benchmark/presentation/run_display.py
CHANGED

@@ -59,6 +59,9 @@ class DisplayPrediction:

     annotations: Optional[Dict[str, Any]]

+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+

 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )

+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@
             if os.path.exists(image_location)
         ]

+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
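The new thinking_text field threads the reasoning trace of "thinking" models through to the per-run display JSON. A small sketch of a downstream reader; the display_predictions.json file name and layout are assumptions based on run_display.py's existing conventions, not something shown in this diff:

    import json
    import os

    run_path = "benchmark_output/runs/my-suite/my-run"  # hypothetical run directory

    with open(os.path.join(run_path, "display_predictions.json")) as f:
        predictions = json.load(f)

    for prediction in predictions:
        thinking = prediction.get("thinking_text")  # None or absent for models without a thinking trace
        if thinking:
            print(prediction["instance_id"], f"({len(thinking)} characters of thinking)")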
helm/benchmark/presentation/schema.py
CHANGED

@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn


 # TODO: change to `helm.benchmark.config`
@@ -204,6 +205,11 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])

+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+

 @dataclass
 class Schema:
@@ -281,5 +287,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
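The new RunGroup.short_description is intended for space-constrained frontend tables and falls back to description when unset (see the summarize.py change below). A minimal illustration of that fallback using a stand-in dataclass rather than HELM's RunGroup:

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class Group:  # stand-in for RunGroup, which has many more fields
        description: str
        short_description: Optional[str] = None


    group = Group(description="A long-form description of the run group.")
    # Same fallback chain used when building table headers: short_description, then description.
    label = group.short_description or group.description or ""
    print(label)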
helm/benchmark/presentation/summarize.py
CHANGED

@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -294,7 +294,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:


 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"

@@ -406,8 +405,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-                    hlog(
-                        f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -440,14 +439,14 @@
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +543,8 @@

         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )

@@ -738,8 +737,8 @@
             if stat is None:
                 # Print out near misses to provide a more informative warning
                 near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-                hlog(
-                    f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+                hwarn(
+                    f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                     f"{len(near_misses)} near misses matching just the name"
                 )
                 if len(near_misses) > 0:
@@ -810,7 +809,7 @@
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])

         header: List[HeaderCell] = []
@@ -831,7 +830,7 @@
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -839,7 +838,8 @@
             }

             header_name = header_field.get_short_display_name()
-            description = (run_group.description + "\n\n" if run_group.description else "") + (
+            run_group_short_description = run_group.short_description or run_group.description or ""
+            description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
                 (header_field.display_name if header_field.display_name else header_field.name)
                 + ": "
                 + (header_field.description if header_field.description is not None else "")
@@ -959,8 +959,8 @@
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@


 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o",
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1253,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites",
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1284,48 +1340,15 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    summarize(args)


 if __name__ == "__main__":
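With argument handling isolated in main() and the pipeline wrapped in summarize(args), summarization can in principle be driven programmatically. A sketch only: the Namespace field names come from the diff above, but the default values chosen here are assumptions, not documented behavior:

    from argparse import Namespace

    from helm.benchmark.presentation.summarize import summarize

    args = Namespace(
        suite="v1",                      # or suite=None together with release=... and suites=[...]
        release=None,
        suites=None,
        schema_path=None,                # None falls back to get_default_schema_path()
        output_path="benchmark_output",
        local_path="prod_env",           # assumption: mirrors other HELM CLIs
        debug=False,
        num_threads=8,
        allow_unknown_models=True,       # assumption
        summarizer_class_name=None,
        skip_completed_run_display_json=False,
    )
    summarize(args)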
helm/benchmark/presentation/test_create_plots.py
CHANGED

@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/reeval_run.py
CHANGED

@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args

@@ -191,9 +191,8 @@ def main():
     )

     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )

     hlog("Done.")
helm/benchmark/reeval_runner.py
CHANGED

@@ -12,7 +12,7 @@ from datasets import load_dataset

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return

         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")

         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")