crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py
@@ -1,7 +1,7 @@
 from typing import List
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -94,3 +94,34 @@ class MIMICIVBillingCodeMetric(Metric):
             Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
             Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="mimiciv_billing_code_precision",
+                display_name="Precision for MIMIC Billing Codes",
+                short_display_name="MIMICBillingPre",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes predicted by "
+                "the model.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_recall",
+                display_name="Recall for MIMIC Billing Codes",
+                short_display_name="MIMICBillingRec",
+                description="Measures the proportion of correctly predicted ICD codes among all ICD codes present in "
+                "the gold standard.",
+                lower_is_better=False,
+                group=None,
+            ),
+            MetricMetadata(
+                name="mimiciv_billing_code_f1",
+                display_name="F1 Score for MIMIC Billing Codes",
+                short_display_name="MIMICBillingF1",
+                description="Measures the harmonic mean of precision and recall for ICD codes, providing a balanced "
+                "evaluation of the model's performance.",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
helm/benchmark/metrics/omni_math_metrics.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -30,3 +30,15 @@ class OmniMATHMetric(Metric):
         return [
             Stat(MetricName("omni_math_accuracy")).add(score),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="omni_math_accuracy",
+                display_name="Omni-MATH Accuracy",
+                short_display_name="Acc",
+                description="Accuracy of the AI output judged by GPT-4.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
helm/benchmark/metrics/seahelm_metrics.py
@@ -8,7 +8,7 @@ from sacrebleu.metrics import CHRF
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -73,6 +73,19 @@ class SEAHELMMachineTranslationMetric(Metric):
 
         return result
 
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="chr_f_plus_plus",
+                display_name="ChrF++",
+                description="Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, "
+                "2015)](https://aclanthology.org/W15-3049/). Code can be found "
+                "[here](https://github.com/mjpost/sacrebleu).",
+                lower_is_better=False,
+                group=None,
+            ),
+        ]
+
 
 class SEAHELMQAMetric(Metric):
     """SEAHELM QA Metrics
helm/benchmark/metrics/summac/model_summac.py
@@ -219,7 +219,7 @@ class SummaCConv(torch.nn.Module):
         imager_load_cache=True,
         agg="mean",
         norm_histo=False,
-        **kwargs
+        **kwargs,
     ):
         # `bins` should be `even%d` or `percentiles`
         assert nli_labels in ["e", "c", "n", "ec", "en", "cn", "ecn"], "Unrecognized nli_labels argument %s" % (
@@ -405,7 +405,7 @@ class SummaCZS:
         use_con=True,
         imager_load_cache=True,
         device="cuda",
-        **kwargs
+        **kwargs,
    ):
        assert op2 in ["min", "mean", "max"], "Unrecognized `op2`"
        assert op1 in ["max", "mean", "min"], "Unrecognized `op1`"
helm/benchmark/metrics/summarization_metrics.py
@@ -16,7 +16,7 @@ from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.metrics.metric import Metric, MetricResult
+from helm.benchmark.metrics.metric import Metric, MetricMetadata, MetricResult
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -240,6 +240,134 @@ class SummarizationMetric(Metric):
 
         return result
 
+    def get_metadata(self):
+        metadata: List[MetricMetadata] = [
+            MetricMetadata(
+                name="QAFactEval",
+                display_name="QAFactEval",
+                description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_coverage",
+                display_name="Coverage",
+                description="Extent to which the model-generated summaries are extractive fragments from the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_density",
+                display_name="Density",
+                description="Extent to which the model-generated summaries are extractive summaries based on the "
+                "source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="summarization_compression",
+                display_name="Compression",
+                description="Extent to which the model-generated summaries are compressed relative to the source "
+                "document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).",
+                lower_is_better=None,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge_1",
+                display_name="ROUGE-1",
+                short_display_name="ROUGE-1",
+                description="ROUGE-1",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-2",
+                display_name="ROUGE-2",
+                short_display_name="ROUGE-2",
+                description="ROUGE-2",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+            MetricMetadata(
+                name="rouge-l",
+                display_name="ROUGE-L",
+                short_display_name="ROUGE-L",
+                description="ROUGE-L",
+                lower_is_better=False,
+                group="summarization_metrics",
+            ),
+        ]
+        if self.humaneval is not None:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="HumanEval-faithfulness",
+                        display_name="HumanEval-faithfulness",
+                        description="Human evaluation score for faithfulness.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-relevance",
+                        display_name="HumanEval-relevance",
+                        description="Human evaluation score for relevance.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                    MetricMetadata(
+                        name="HumanEval-coherence",
+                        display_name="HumanEval-coherence",
+                        description="Human evaluation score for coherence.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        if self.compute_faithfulness:
+            metadata.append(
+                MetricMetadata(
+                    name="summac",
+                    display_name="SummaC",
+                    description="Faithfulness scores based on the SummaC method of [Laban et al. "
+                    "(2022)](https://aclanthology.org/2022.tacl-1.10/).",
+                    lower_is_better=False,
+                    group="summarization_metrics",
+                )
+            )
+        if self.compute_bertscore:
+            metadata.extend(
+                [
+                    MetricMetadata(
+                        name="BERTScore-P",
+                        display_name="BERTScore (P)",
+                        description="Average BERTScore precision [(Zhang et al., "
+                        "2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference "
+                        "summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-R",
+                        display_name="BERTScore (R)",
+                        description="Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group=None,
+                    ),
+                    MetricMetadata(
+                        name="BERTScore-F",
+                        display_name="BERTScore (F1)",
+                        description="Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) "
+                        "between model generation and reference summary.",
+                        lower_is_better=False,
+                        group="summarization_metrics",
+                    ),
+                ]
+            )
+        return metadata
+
 
 def _paired_bootstrap_test(treatment_list: list, control_list: list, nboot: int = 10000):
     """
helm/benchmark/metrics/toxicity_metrics.py
@@ -6,7 +6,7 @@ from helm.common.hierarchical_logger import hlog, hwarn
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -89,3 +89,33 @@ class ToxicityMetric(Metric):
         ]
 
         return stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="expected_max_toxicity",
+                display_name="Average max toxicity",
+                short_display_name="Avg. max toxicity",
+                description="Average max toxicity score across all completions (based on the PerspectiveAPI toxicity "
+                "classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="max_toxicity_probability",
+                display_name="max toxicity",
+                description="Fraction of instances where most toxic completion is above 0.5 toxicity (based on the "
+                "PerspectiveAPI toxicity classifier; [Gehman et al. "
+                "(2020)](https://aclanthology.org/2020.findings-emnlp.301/)).",
+                lower_is_better=True,
+                group=None,
+            ),
+            MetricMetadata(
+                name="toxic_frac",
+                display_name="Toxic fraction",
+                description="Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity "
+                "classifier).",
+                lower_is_better=True,
+                group="toxicity",
+            ),
+        ]
helm/benchmark/metrics/wildbench_metrics.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric import Metric, MetricMetadata
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
@@ -32,3 +32,23 @@ class WildBenchScoreMetric(Metric):
             Stat(MetricName("wildbench_score")).add(score),
             Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
         ]
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="wildbench_score",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+            MetricMetadata(
+                name="wildbench_score_rescaled",
+                display_name="WildBench Score",
+                short_display_name="WB Score",
+                description="Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.",
+                lower_is_better=False,
+                group="accuracy",
+            ),
+        ]
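
The metric hunks above all make the same change: import MetricMetadata and add a get_metadata() method describing the stats the metric emits, which the reworked summarizer (see the summarize.py hunks below) uses to auto-generate a display schema. The following is a minimal sketch of that pattern for a hypothetical metric; the class name, metric name, and description are illustrative only, and only the MetricMetadata fields that appear in the hunks above are assumed.

from typing import List

from helm.benchmark.metrics.metric import Metric, MetricMetadata


class MyToyMetric(Metric):
    # evaluate_generation() (not shown) would emit Stat(MetricName("my_toy_score")).

    def get_metadata(self) -> List[MetricMetadata]:
        # The name must match the Stat name the metric produces; otherwise the
        # summarizer's get_metric_metadata() prunes the entry.
        return [
            MetricMetadata(
                name="my_toy_score",
                display_name="My Toy Score",
                short_display_name="Toy",
                description="Illustrative description shown on the frontend.",
                lower_is_better=False,
                group=None,
            ),
        ]
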
helm/benchmark/presentation/schema.py
@@ -8,6 +8,7 @@ import mako.template
 import yaml
 import importlib_resources as resources
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
@@ -131,24 +132,6 @@ THIS_GROUP_ONLY = "this_group_only"
 NO_GROUPS = "no_groups"
 
 
-@dataclass(frozen=True)
-class TaxonomyInfo:
-    # Task (e.g., question answering)
-    task: Optional[str] = None
-
-    # Domain - genre (e.g., Wikipedia)
-    what: Optional[str] = None
-
-    # Domain - when it was written (e.g., 2010s)
-    when: Optional[str] = None
-
-    # Domain - demographics (e.g., web users)
-    who: Optional[str] = None
-
-    # Language (e.g., English)
-    language: Optional[str] = None
-
-
 @dataclass(frozen=True)
 class RunGroup(Field):
     """
@@ -216,16 +199,16 @@ class Schema:
     """Specifies information about what to display on the frontend."""
 
     # Information about each field
-    metrics: List[Field]
+    metrics: List[Field] = field(default_factory=list)
 
     # Information about each perturbation
-    perturbations: List[Field]
+    perturbations: List[Field] = field(default_factory=list)
 
     # Group the metrics
-    metric_groups: List[MetricGroup]
+    metric_groups: List[MetricGroup] = field(default_factory=list)
 
     # Group the scenarios
-    run_groups: List[RunGroup]
+    run_groups: List[RunGroup] = field(default_factory=list)
 
     # Adapter fields (e.g., temperature)
     # Automatically populated from the docstrings in the AdapterSpec class definition.
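
The schema.py changes above move TaxonomyInfo into its own module (helm/benchmark/presentation/taxonomy_info.py, new in this release per the file list) and give every top-level Schema field a default, so an empty schema is now constructible. A small illustrative sketch, mirroring what the summarize.py hunks below do; treat it as an assumption-level example rather than documented API.

import dataclasses

from helm.benchmark.presentation.schema import Schema, read_schema

schema_path = None  # what --auto-generate-schema yields (see summarize.py below)
schema = read_schema(schema_path) if schema_path else Schema()  # empty but valid Schema

# fix_up_schema() then swaps auto-generated pieces into the dataclass, e.g.:
schema = dataclasses.replace(schema, metrics=schema.metrics)  # the real code passes auto_generate_metric_fields()
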
helm/benchmark/presentation/summarize.py
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import get_all_stats_by_name
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -341,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -376,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-        ensure_directory_exists(self.run_release_path)
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -426,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -509,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
@@ -1070,7 +1223,8 @@ class Summarizer:
                 is_scenario_table=False,
                 aggregation_strategies=aggregate_strategies,
             )
-            tables.append(table)
+            if len(table.header) > 1:
+                tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.write_run_display_json(skip_completed)
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
        # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1346,6 +1510,11 @@ def main():
         default=None,
         help="PATH to a YAML file to customize logging",
    )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
     args = parser.parse_args()
     setup_default_logging(args.log_config)
     summarize(args)
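
Taken together, the summarize.py hunks wire the new metadata hooks into schema generation: runs are read first, fix_up_schema() fills in any missing metrics, metric groups, and run groups from the collected MetricMetadata and ScenarioMetadata, and only then is the schema checked and written. In practice this is driven by the new flag, for example via the package's helm-summarize entry point: helm-summarize --suite <suite-name> --auto-generate-schema (the suite name here is a placeholder). With the flag set, no schema file is read and the Summarizer starts from an empty Schema(); the flag is marked EXPERIMENTAL in its help text and cannot be combined with --schema-path.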