crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (311)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/classification_metrics.py +19 -1
  27. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  28. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  29. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  30. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  31. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  32. helm/benchmark/metrics/comet_metric.py +1 -1
  33. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  34. helm/benchmark/metrics/copyright_metrics.py +1 -1
  35. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  36. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  37. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  38. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  39. helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
  40. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  41. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  42. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  43. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  44. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  45. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  46. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  47. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  48. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  49. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  50. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  51. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  52. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  53. helm/benchmark/metrics/medec_metrics.py +25 -2
  54. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  55. helm/benchmark/metrics/metric.py +25 -0
  56. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  57. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  58. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  59. helm/benchmark/metrics/summac/model_summac.py +3 -3
  60. helm/benchmark/metrics/summarization_metrics.py +129 -1
  61. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  62. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  63. helm/benchmark/model_deployment_registry.py +11 -19
  64. helm/benchmark/presentation/create_plots.py +11 -2
  65. helm/benchmark/presentation/schema.py +10 -22
  66. helm/benchmark/presentation/summarize.py +189 -14
  67. helm/benchmark/presentation/taxonomy_info.py +20 -0
  68. helm/benchmark/presentation/test_create_plots.py +4 -1
  69. helm/benchmark/run.py +7 -1
  70. helm/benchmark/run_expander.py +4 -0
  71. helm/benchmark/run_specs/arabic_run_specs.py +191 -0
  72. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  73. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  74. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  75. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  76. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  77. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  78. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  79. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  80. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  81. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  82. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  83. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  84. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  85. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  86. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  87. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  88. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  89. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  90. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  91. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  92. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  93. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  94. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  95. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  96. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  97. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  98. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  99. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  100. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  101. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  102. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  103. helm/benchmark/scenarios/bold_scenario.py +15 -0
  104. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  105. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  106. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  107. helm/benchmark/scenarios/clear_scenario.py +23 -0
  108. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  109. helm/benchmark/scenarios/code_scenario.py +28 -0
  110. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  111. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  112. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  113. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  114. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  115. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  116. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  117. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  118. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  119. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  120. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  121. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  122. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  123. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  124. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  125. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  126. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  127. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  128. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  129. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  130. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  131. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  132. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  133. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  134. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  135. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  136. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  137. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  138. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  139. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  140. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  141. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  142. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  143. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  144. helm/benchmark/scenarios/ice_scenario.py +21 -1
  145. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  146. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  147. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  148. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  149. helm/benchmark/scenarios/koala_scenario.py +21 -1
  150. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  151. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  152. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  153. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  154. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  155. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  156. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  157. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  158. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  159. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  160. helm/benchmark/scenarios/math_scenario.py +47 -20
  161. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  162. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  163. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  164. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  165. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  166. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  167. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  168. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  169. helm/benchmark/scenarios/medec_scenario.py +23 -0
  170. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  171. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  172. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  173. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  174. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  175. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  176. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  177. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  178. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  179. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  180. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  181. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  182. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  183. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  184. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  185. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  186. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  187. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  188. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  189. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  190. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  191. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  192. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  193. helm/benchmark/scenarios/quac_scenario.py +14 -0
  194. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  195. helm/benchmark/scenarios/raft_scenario.py +15 -0
  196. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  197. helm/benchmark/scenarios/scenario.py +31 -0
  198. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  199. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  200. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  201. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  202. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  203. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  204. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  205. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  206. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  207. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  208. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  209. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  210. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  211. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  212. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  213. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  214. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  215. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  216. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  217. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  218. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  219. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  220. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  221. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  222. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  223. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  224. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  225. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  226. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  227. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  228. helm/benchmark/slurm_jobs.py +1 -2
  229. helm/benchmark/slurm_runner.py +8 -1
  230. helm/benchmark/static/schema_arabic.yaml +271 -0
  231. helm/benchmark/static/schema_classic.yaml +0 -17
  232. helm/benchmark/static/schema_long_context.yaml +24 -6
  233. helm/benchmark/static/schema_medhelm.yaml +36 -0
  234. helm/benchmark/static/schema_slp.yaml +219 -0
  235. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  236. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  237. helm/benchmark/static_build/index.html +2 -2
  238. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  239. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  240. helm/clients/audio_language/llama_omni/constants.py +9 -0
  241. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  242. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  243. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  244. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  245. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  246. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  247. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  248. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  249. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  250. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  251. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  252. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  253. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  254. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  255. helm/clients/audio_language/llama_omni/utils.py +202 -0
  256. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  257. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  258. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  259. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  260. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  261. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  262. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  263. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  264. helm/clients/huggingface_client.py +2 -2
  265. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  266. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  267. helm/clients/openai_client.py +33 -20
  268. helm/clients/openai_responses_client.py +34 -8
  269. helm/clients/openrouter_client.py +31 -0
  270. helm/clients/test_huggingface_client.py +3 -3
  271. helm/clients/test_openrouter_client.py +69 -0
  272. helm/clients/together_client.py +48 -13
  273. helm/clients/vertexai_client.py +19 -11
  274. helm/clients/vllm_client.py +43 -7
  275. helm/clients/vllm_granite_thinking_client.py +56 -0
  276. helm/common/critique_request.py +0 -1
  277. helm/common/hierarchical_logger.py +83 -34
  278. helm/common/object_spec.py +23 -8
  279. helm/common/test_logging.py +94 -0
  280. helm/config/model_deployments.yaml +525 -172
  281. helm/config/model_metadata.yaml +185 -10
  282. helm/config/tokenizer_configs.yaml +100 -2
  283. helm/proxy/cli.py +1 -1
  284. helm/proxy/example_queries.py +8 -8
  285. helm/proxy/retry.py +5 -0
  286. helm/proxy/server.py +2 -1
  287. helm/proxy/static/index.css +4 -0
  288. helm/proxy/static/index.js +7 -1
  289. helm/tokenizers/grok_tokenizer.py +2 -0
  290. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  291. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  292. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  293. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  294. helm/benchmark/metrics/medalign_metrics.py +0 -14
  295. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  296. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  297. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  298. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  299. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  300. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  301. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  302. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  303. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  304. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  305. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  306. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  307. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  308. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  309. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  310. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  311. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/test_openrouter_client.py
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
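Note: the two parametrized cases pin down how the client resolves the model to send. With model_name=None, the client falls back to the request's model; the first case passes an explicit model_name, which presumably takes precedence (both names match there, so the tests alone do not prove the override). A minimal self-contained sketch of logic consistent with both cases; the function body and the _FakeRequest stand-in are assumptions, only the method name and expected values come from the diff:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _FakeRequest:  # stand-in for helm.common.request.Request
        model: str

    def get_model_for_request(client_model_name: Optional[str], request: _FakeRequest) -> str:
        # Prefer the client-level override; otherwise fall back to the request's model.
        return client_model_name or request.model

    assert get_model_for_request(None, _FakeRequest(model="openai/gpt-oss-20b:free")) == "openai/gpt-oss-20b:free"
    assert get_model_for_request("mistralai/mistral-medium-3.1", _FakeRequest(model="x")) == "mistralai/mistral-medium-3.1"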
helm/clients/together_client.py
@@ -25,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -101,7 +99,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -114,6 +125,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -348,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -446,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
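Taken together, the together_client.py changes split thinking-token parsing out per model family and dispatch on the model name. A minimal runnable sketch of the DeepSeek-R1 branch, reusing the regex from the diff above; the sample completion string is made up:

    import re
    from typing import Tuple

    def parse_thinking_deepseek_r1(text: str) -> Tuple[str, str]:
        # Splits "<think>\n{thinking}\n</think>\n\n{answer}" into (thinking, answer).
        match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", text, re.DOTALL)
        if match:
            return (match.group(1), match.group(2))
        return (text, "")  # simplified: the real helper also handles an unterminated <think> block

    print(parse_thinking_deepseek_r1("<think>\nAdd 2 and 2.\n</think>\n\n4"))  # ('Add 2 and 2.', '4')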
helm/clients/vertexai_client.py
@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes
@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        contents = [request.prompt]
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for msg in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -274,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
             if not candidate.content:
                 raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
             if not candidate.content.parts:
-                raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-            predictions.append({"text": candidate.content.text})
+                if candidate.finish_reason == 2:  # MAX_TOKENS
+                    # This means that there is no text output because the maximum number of tokens were
+                    # reached during thinking.
+                    predictions.append({"text": ""})
+                else:
+                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+            else:
+                predictions.append({"text": candidate.content.text})
         # TODO: Extract more information from the response
         return {"predictions": predictions}
 
helm/clients/vllm_client.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if "best_of" in raw_request and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
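For orientation, the new VLLMChatClient is constructed like any OpenAI-compatible client but pointed at a local server. A minimal wiring sketch; the base URL and model name are placeholders, and the BlackHoleCacheConfig/HuggingFaceTokenizer usage mirrors the test file earlier in this diff:

    from helm.common.cache import BlackHoleCacheConfig
    from helm.clients.vllm_client import VLLMChatClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    model = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder model name
    client = VLLMChatClient(
        tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=model),
        tokenizer_name=model,
        cache_config=BlackHoleCacheConfig(),
        base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
        vllm_model_name=model,  # forwarded to the parent as openai_model_name
    )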
helm/clients/vllm_granite_thinking_client.py
@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)
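The Granite parser expects reasoning wrapped in <think> tags followed by a <response> block. A quick standalone check of the first regex branch above on a made-up completion:

    import re

    text = "<think>Weigh the options.</think> <response>Option B.</response>"
    match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", text, re.DOTALL)
    assert match is not None
    print(match.group(1))  # Weigh the options.
    print(match.group(2))  # Option B.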
helm/common/critique_request.py
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"
helm/common/hierarchical_logger.py
@@ -1,4 +1,7 @@
 import logging
+import logging.config
+import yaml
+import os
 import sys
 import time
 from typing import Any, Callable, List, Optional
@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
     def indent(self) -> str:
         return " " * len(self.start_times)
 
-    def track_begin(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x) + " {")
+    def track_begin(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x) + " {", **kwargs)
         sys.stdout.flush()
         self.start_times.append(time.time())
 
-    def track_end(self) -> None:
+    def track_end(self, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
         t = time.time() - self.start_times.pop()
-        self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
         sys.stdout.flush()
 
-    def log(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x))
+    def log(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
-    def warn(self, x: Any) -> None:
-        self.logger.warning(self.indent() + str(x))
+    def debug(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.debug(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+
+    def warn(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
 
@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
 # Exposed public methods
 
 
-def hlog(x: Any) -> None:
-    singleton.log(x)
+def hdebug(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.debug(x, **kwargs)
+
+
+def hlog(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.log(x, **kwargs)
 
 
-def hwarn(x: Any) -> None:
-    singleton.warn(x)
+def hwarn(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.warn(x, **kwargs)
 
 
 class htrack_block:
-    def __init__(self, x: Any) -> None:
+    def __init__(self, x: Any, stacklevel=1) -> None:
+        self._stacklevel = stacklevel + 1
         self.x = x
 
     def __enter__(self) -> None:
-        singleton.track_begin(self.x)
+        singleton.track_begin(self.x, stacklevel=self._stacklevel)
 
     def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
-        singleton.track_end()
+        singleton.track_end(stacklevel=self._stacklevel)
 
 
 class htrack:
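The stacklevel bookkeeping threaded through these wrappers exists so that log records attribute to the caller of hlog/hwarn rather than to the wrapper frames themselves. A standalone illustration of the mechanism (Python 3.8+; the format string is illustrative, not HELM's):

    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, format="%(filename)s:%(lineno)d %(message)s")

    def hlog(x, **kwargs):
        # Without bumping stacklevel, every record would point at the warning() call below.
        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
        logging.getLogger().warning(str(x), **kwargs)

    hlog("hello")  # the record reports this line, not the line inside hlog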
@@ -116,34 +136,63 @@ class htrack:
                     description = description.replace("$" + k, str(v))
             else:
                 description = ""
-            with htrack_block(parent + fn.__name__ + description):
+            with htrack_block(parent + fn.__name__ + description, stacklevel=2):
                 return fn(*args, **kwargs)
 
         return wrapper
 
 
-def setup_default_logging():
+def setup_default_logging(config_path: Optional[str] = None):
     """
-    Setup a default logger to STDOUT for HELM via Python logging
-    """
-    formatter = ColoredFormatter(
-        "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
-        datefmt="%Y-%m-%dT%H:%M:%S",
-        reset=True,
-        log_colors={
-            "DEBUG": "cyan",
-            "INFO": "green",
-            "WARNING": "yellow",
-            "ERROR": "red",
-            "CRITICAL": "red,bg_white",
-        },
-        secondary_log_colors={},
-        style="%",
-    )
+    Setup Python logging for HELM
 
+    Priority:
+    1. External config file (YAML or JSON).
+    2. ENV var LOG_LEVEL.
+    3. a default logger to STDOUT
+    """
     logger = logging.getLogger("helm")
-    logger.setLevel(logging.INFO)
     logger.propagate = False
+
+    if config_path and os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        logging.config.dictConfig(config)
+        hdebug("setup custom HELM logging")
+        return
+
+    log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
+    try:
+        logger.setLevel(getattr(logging, log_level))
+    except AttributeError:
+        logger.setLevel(logging.INFO)
+
+    # Set formatter
+    formatter: Optional[logging.Formatter] = None
+    if sys.stdout.isatty():
+        try:
+            formatter = ColoredFormatter(
+                "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+                datefmt="%Y-%m-%dT%H:%M:%S",
+                reset=True,
+                log_colors={
+                    "DEBUG": "cyan",
+                    "INFO": "green",
+                    "WARNING": "yellow",
+                    "ERROR": "red",
+                    "CRITICAL": "red,bg_white",
+                },
+                style="%",
+            )
+        except ImportError:
+            pass
+
+    if formatter is None:
+        # fallback
+        formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+
+    # Add default stdout handler
     handler = logging.StreamHandler(sys.stdout)
     handler.setFormatter(formatter)
     logger.addHandler(handler)
+    hdebug("setup default HELM logging")
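Because setup_default_logging now passes the parsed YAML straight to logging.config.dictConfig, a config file is just a standard dictConfig document. A minimal hypothetical example, embedded here as Python for self-containment (the handler and format choices are illustrative, not shipped defaults):

    import logging.config
    import yaml

    config = yaml.safe_load(
        """
        version: 1
        disable_existing_loggers: false
        formatters:
          plain:
            format: "%(asctime)s %(levelname)-8s %(message)s"
        handlers:
          stdout:
            class: logging.StreamHandler
            formatter: plain
            stream: ext://sys.stdout
        loggers:
          helm:
            level: DEBUG
            handlers: [stdout]
            propagate: false
        """
    )
    logging.config.dictConfig(config)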
helm/common/object_spec.py
@@ -55,14 +55,23 @@ def inject_object_spec_args(
     This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
 
     Example:
-
-        class MyClass:
-            def __init__(a: int, b: int, c: int, d: int = 0):
-                pass
-
-        old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
-        new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
-        # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
+    >>> from helm.common.object_spec import *  # NOQA
+    >>> import sys, types
+    >>> # Given a custom class with hashable arguments
+    >>> class MyClass:
+    ...     def __init__(a: int, b: int, c: int, d: int = 0):
+    ...         pass
+    >>> #
+    >>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
+    >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
+    >>> # </boilerplate>
+    >>> #
+    >>> # Define new style and old style object specs
+    >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
+    >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
+    >>> # new_object_spec is now
+    >>> print(new_object_spec)
+    ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
     """
     cls = get_class_by_name(spec.class_name)
     init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
         <class_name>:<key>=<value>,<key>=<value>
     Usually, the description is something that's succinct and can be typed on the command-line.
     Here, value defaults to string.
+
+    Example:
+    >>> from helm.common.object_spec import *  # NOQA
+    >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
+    >>> parse_object_spec(description)
+    ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
     """
 
     def parse_arg(arg: str) -> Tuple[str, Any]: