crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from .perturbation import TextPerturbation
5
+ from .perturbation_description import PerturbationDescription
6
+
7
+
8
class SuffixPerturbation(TextPerturbation):
    """
    Text perturbation that tacks a fixed suffix onto the end of the input,
    separated from it by a comma and a space.

    Example with the suffix "picasso":

        A picture of a dog -> A picture of a dog, picasso
    """

    @dataclass(frozen=True)
    class Description(PerturbationDescription):
        # The suffix that gets appended; defaults to the empty string.
        suffix: str = ""

    # Name reported in the description (note: "style", not "suffix").
    name: str = "style"

    def __init__(self, suffix: str):
        # Kept verbatim; `perturb` joins it to the text with ", ".
        self._suffix: str = suffix

    @property
    def description(self) -> PerturbationDescription:
        """Return a `Description` carrying this perturbation's name and suffix."""
        details = SuffixPerturbation.Description(name=self.name, suffix=self._suffix)
        return details

    def perturb(self, text: str, rng: Random) -> str:
        """Return `text` followed by ", " and the configured suffix (`rng` is not used)."""
        return "{}, {}".format(text, self._suffix)
@@ -2,6 +2,7 @@
2
2
  from typing import List
3
3
  import unittest
4
4
 
5
+ from helm.common.media_object import MediaObject, MultimediaObject
5
6
  from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference
6
7
  from .data_augmenter import DataAugmenter
7
8
  from .extra_space_perturbation import ExtraSpacePerturbation
@@ -15,6 +16,7 @@ from .space_perturbation import SpacePerturbation
15
16
  from .dialect_perturbation import DialectPerturbation
16
17
  from .person_name_perturbation import PersonNamePerturbation
17
18
  from .gender_perturbation import GenderPerturbation
19
+ from .suffix_perturbation import SuffixPerturbation
18
20
 
19
21
 
20
22
  def test_extra_space_perturbation():
@@ -32,6 +34,35 @@ def test_extra_space_perturbation():
32
34
  assert instances[1].references[0].output.text == "some name"
33
35
 
34
36
 
37
def test_multimodal_text_perturbation():
    """
    Check that a text perturbation is applied to the text media objects of a
    multimodal input, while the original instance is returned untouched.
    """
    data_augmenter = DataAugmenter(perturbations=[ExtraSpacePerturbation(num_spaces=3)])
    # Named `multimodal_input` rather than `input` to avoid shadowing the builtin.
    multimodal_input: Input = Input(
        multimedia_content=MultimediaObject(
            [
                MediaObject(text="Hello what is", content_type="text/plain"),
                MediaObject(text="your name", content_type="text/plain"),
            ]
        )
    )
    instance: Instance = Instance(
        id="id0",
        input=multimodal_input,
        references=[Reference(Output(text="some name"), tags=[])],
    )
    instances: List[Instance] = data_augmenter.generate([instance], include_original=True)

    assert len(instances) == 2

    # Test that the first instance is unperturbed
    assert instances[0].id == "id0"
    assert instances[0].perturbation is None
    media_objects = instances[0].input.multimedia_content.media_objects
    assert media_objects[0].text == "Hello what is"
    assert media_objects[1].text == "your name"

    # Test that the second instance is perturbed: with num_spaces=3 the expected
    # texts must have three spaces between words, otherwise the assertions would
    # be indistinguishable from the unperturbed case above.
    assert instances[1].id == "id0"
    assert instances[1].perturbation.name == "extra_space"
    media_objects = instances[1].input.multimedia_content.media_objects
    assert media_objects[0].text == "Hello   what   is"
    assert media_objects[1].text == "your   name"
64
+
65
+
35
66
  def test_misspelling_perturbation():
36
67
  data_augmenter = DataAugmenter(perturbations=[MisspellingPerturbation(prob=1.0)])
37
68
  instance: Instance = Instance(
@@ -145,7 +176,6 @@ def test_space_perturbation():
145
176
  instance: Instance = Instance(id="id0", input=Input(text="Hello World!\nQuite a day, huh?"), references=[])
146
177
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
147
178
 
148
- print(instances)
149
179
  assert len(instances) == 2
150
180
  assert instances[1].perturbation.name == "space"
151
181
  assert instances[1].input.text == "Hello World!\nQuite a day, huh?"
@@ -162,7 +192,6 @@ def test_dialect_perturbation():
162
192
  )
163
193
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
164
194
 
165
- print(instances)
166
195
  assert len(instances) == 2
167
196
  assert instances[1].perturbation.name == "dialect"
168
197
  assert instances[1].input.text == "I gon remember dis day to b the best day of mah life."
@@ -188,7 +217,6 @@ def test_person_name_perturbation():
188
217
  )
189
218
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
190
219
 
191
- print(instances)
192
220
  assert len(instances) == 2
193
221
  assert instances[1].perturbation.name == "person_name"
194
222
  assert (
@@ -209,7 +237,6 @@ def test_gender_pronoun_perturbation():
209
237
  )
210
238
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
211
239
 
212
- print(instances)
213
240
  assert len(instances) == 2
214
241
  assert instances[1].perturbation.mode == "pronouns"
215
242
  assert instances[1].input.text == "Did she mention that she was coming with her parents and their friends?"
@@ -227,13 +254,22 @@ def test_gender_term_perturbation():
227
254
  )
228
255
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
229
256
 
230
- print(instances)
231
257
  assert len(instances) == 2
232
258
  assert instances[1].perturbation.mode == "terms"
233
259
  assert instances[1].input.text == "His granddaughters looked a lot like their mom."
234
260
  assert instances[1].references[0].output.text == "How did their mother look like?"
235
261
 
236
262
 
263
def test_suffix_perturbation():
    """Check that `SuffixPerturbation` records its suffix and appends ", <suffix>" to the input text."""
    suffix_perturbation = SuffixPerturbation(suffix="pixel art")
    augmenter = DataAugmenter(perturbations=[suffix_perturbation])
    original: Instance = Instance(id="id0", input=Input(text="A blue dog"), references=[])

    augmented: List[Instance] = augmenter.generate([original], include_original=True)

    # `include_original=True` is expected to yield two instances; the second is the perturbed one.
    assert len(augmented) == 2
    perturbed = augmented[1]
    assert perturbed.perturbation.suffix == "pixel art"
    assert perturbed.input.text == "A blue dog, pixel art"
271
+
272
+
237
273
  # TODO(#1958) Fix the logic to re-enable this test
238
274
  @unittest.skip("Currently cannot replace words at either text boundary.")
239
275
  def test_gender_term_perturbation_edge_word():
@@ -247,7 +283,6 @@ def test_gender_term_perturbation_edge_word():
247
283
  )
248
284
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
249
285
 
250
- print(instances)
251
286
  assert len(instances) == 1
252
287
  assert instances[0].input.text == "mom said it is okay"
253
288
  assert instances[0].references[0].output.text == "Sure he did daughter"
@@ -266,6 +301,5 @@ def test_gender_term_perturbation_consequtive_words():
266
301
  )
267
302
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
268
303
 
269
- print(instances)
270
304
  assert len(instances) == 1
271
305
  assert instances[0].input.text == "I'm a mom mom: my daughter has a daughter."
@@ -0,0 +1,30 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from helm.clients.google_translate_client import GoogleTranslateClient
5
+ from .perturbation import TextPerturbation
6
+ from .perturbation_description import PerturbationDescription
7
+
8
+
9
class TranslatePerturbation(TextPerturbation):
    """
    Text perturbation that translates the input into another language
    by delegating to a `GoogleTranslateClient`.
    """

    @dataclass(frozen=True)
    class Description(PerturbationDescription):
        # Target language code. A default value is needed because this class
        # inherits from `PerturbationDescription`.
        language_code: str = "zh-CN"

    # Name reported in the description.
    name: str = "translate"

    def __init__(self, language_code: str):
        # Language code handed to the translation client on every `perturb` call.
        self.language_code: str = language_code
        # A fresh client is created per perturbation instance.
        self.google_translate_client = GoogleTranslateClient()

    @property
    def description(self) -> PerturbationDescription:
        """Return a `Description` carrying this perturbation's name and target language code."""
        details = TranslatePerturbation.Description(name=self.name, language_code=self.language_code)
        return details

    def perturb(self, text: str, rng: Random) -> str:
        """Return whatever the translation client produces for `text` and the target language (`rng` is not used)."""
        translator = self.google_translate_client
        return translator.translate(text, self.language_code)
@@ -4,16 +4,18 @@ import importlib_resources as resources
4
4
  from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
5
5
  from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
6
6
  from helm.benchmark.tokenizer_config_registry import register_tokenizer_configs_from_path
7
+ from helm.benchmark.runner_config_registry import register_runner_config_from_path
7
8
 
8
9
 
9
10
  MODEL_METADATA_FILE: str = "model_metadata.yaml"
10
11
  TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
11
12
  MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
13
+ RUNNER_CONFIG_FILE: str = "runner_config.yaml"
12
14
 
13
15
  CONFIG_PACKAGE = "helm.config"
14
16
 
15
17
 
16
- def register_configs_from_directory(dir_path) -> None:
18
+ def register_configs_from_directory(dir_path: str) -> None:
17
19
  model_metadata_path = os.path.join(dir_path, MODEL_METADATA_FILE)
18
20
  if os.path.isfile(model_metadata_path):
19
21
  register_model_metadata_from_path(model_metadata_path)
@@ -26,6 +28,10 @@ def register_configs_from_directory(dir_path) -> None:
26
28
  if os.path.isfile(model_deployments_path):
27
29
  register_model_deployments_from_path(model_deployments_path)
28
30
 
31
+ runner_config_path = os.path.join(dir_path, RUNNER_CONFIG_FILE)
32
+ if os.path.isfile(runner_config_path):
33
+ register_runner_config_from_path(runner_config_path)
34
+
29
35
 
30
36
  def register_builtin_configs_from_helm_package() -> None:
31
37
  package_path = str(resources.files(CONFIG_PACKAGE))
@@ -1,9 +1,15 @@
1
1
  from typing import Optional
2
2
  from dataclasses import dataclass, replace
3
+ from helm.common.cache_backend_config import (
4
+ CacheBackendConfig,
5
+ BlackHoleCacheBackendConfig,
6
+ MongoCacheBackendConfig,
7
+ SqliteCacheBackendConfig,
8
+ )
3
9
 
4
10
  from helm.common.general import parallel_map
5
11
  from helm.common.hierarchical_logger import htrack, hlog
6
- from helm.common.request import RequestResult, Sequence
12
+ from helm.common.request import RequestResult, GeneratedOutput
7
13
  from helm.common.authentication import Authentication
8
14
  from helm.proxy.services.remote_service import RemoteService
9
15
  from helm.proxy.services.server_service import ServerService
@@ -18,28 +24,36 @@ class ExecutorError(Exception):
18
24
 
19
25
  @dataclass(frozen=True)
20
26
  class ExecutionSpec:
21
- # If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959).
27
+
22
28
  url: Optional[str]
29
+ """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""
23
30
 
24
- # Pass into the service
25
31
  auth: Authentication
32
+ """Authentication that will be passed into the local service, if using the local service."""
26
33
 
27
- # Path where API credentials and cache is stored.
28
- # This path is the same as `--base-path` when launching the proxy server (see server.py).
29
- # Required when url is not set.
30
34
  local_path: Optional[str]
35
+ """Path where API credentials and cache is stored.
36
+
37
+ This path is the same as `--base-path` when launching the proxy server (see server.py).
38
+ Required when url is not set."""
31
39
 
32
- # How many threads to have at once
33
40
  parallelism: int
41
+ """How many threads to have at once"""
34
42
 
35
- # Whether to skip execution
36
43
  dry_run: bool = False
44
+ """Whether to skip execution"""
45
+
46
+ sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
47
+ """If set, SQLite will be used for the cache.
48
+
49
+ This specifies the directory in which the SQLite cache will store files.
50
+ At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
37
51
 
38
- # URL to the MongoDB database.
39
- # If non-empty, the MongoDB database will be used for caching instead of SQLite.
40
- # Example format: mongodb://[username:password@]host1[:port1]/[dbname]
41
- # For full format, see: https://www.mongodb.com/docs/manual/reference/connection-string/
42
- mongo_uri: str = ""
52
+ mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
53
+ """If set, MongoDB will be used for the cache.
54
+
55
+ This specifies the MongoDB database to be used by the MongoDB cache.
56
+ At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."""
43
57
 
44
58
 
45
59
  class Executor:
@@ -51,6 +65,16 @@ class Executor:
51
65
  def __init__(self, execution_spec: ExecutionSpec):
52
66
  self.execution_spec = execution_spec
53
67
 
68
+ cache_backend_config: CacheBackendConfig
69
+ if execution_spec.sqlite_cache_backend_config and execution_spec.mongo_cache_backend_config:
70
+ raise ExecutorError("At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.")
71
+ elif execution_spec.sqlite_cache_backend_config:
72
+ cache_backend_config = execution_spec.sqlite_cache_backend_config
73
+ elif execution_spec.mongo_cache_backend_config:
74
+ cache_backend_config = execution_spec.mongo_cache_backend_config
75
+ else:
76
+ cache_backend_config = BlackHoleCacheBackendConfig()
77
+
54
78
  self.service: Service
55
79
  if execution_spec.url:
56
80
  hlog(f"Running using remote API proxy server: {execution_spec.url}")
@@ -58,7 +82,9 @@ class Executor:
58
82
  elif execution_spec.local_path:
59
83
  hlog(f"Running in local mode with base path: {execution_spec.local_path}")
60
84
  self.service = ServerService(
61
- base_path=execution_spec.local_path, root_mode=True, mongo_uri=execution_spec.mongo_uri
85
+ base_path=execution_spec.local_path,
86
+ root_mode=True,
87
+ cache_backend_config=cache_backend_config,
62
88
  )
63
89
  else:
64
90
  raise ValueError("Either the proxy server URL or the local path must be set")
@@ -77,7 +103,11 @@ class Executor:
77
103
  )
78
104
 
79
105
  hlog(f"Processed {len(request_states)} requests")
80
- return ScenarioState(scenario_state.adapter_spec, request_states)
106
+ return ScenarioState(
107
+ adapter_spec=scenario_state.adapter_spec,
108
+ request_states=request_states,
109
+ annotator_specs=scenario_state.annotator_specs,
110
+ )
81
111
 
82
112
  def process(self, state: RequestState) -> RequestState:
83
113
  try:
@@ -87,7 +117,7 @@ class Executor:
87
117
  if not result.success:
88
118
  if result.error_flags and not result.error_flags.is_fatal:
89
119
  hlog(f"WARNING: Non-fatal error treated as empty completion: {result.error}")
90
- result.completions = [Sequence(text="", logprob=0, tokens=[])]
120
+ result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
91
121
  else:
92
122
  raise ExecutorError(f"{str(result.error)} Request: {state.request}")
93
123
  return replace(state, result=result)
@@ -4,7 +4,6 @@ from typing import Optional
4
4
  from helm.benchmark.model_deployment_registry import (
5
5
  ClientSpec,
6
6
  ModelDeployment,
7
- WindowServiceSpec,
8
7
  register_model_deployment,
9
8
  )
10
9
  from helm.benchmark.model_metadata_registry import (
@@ -14,6 +13,7 @@ from helm.benchmark.model_metadata_registry import (
14
13
  )
15
14
  from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config
16
15
  from helm.common.hierarchical_logger import hlog
16
+ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
17
17
 
18
18
 
19
19
  def register_huggingface_model(
@@ -23,18 +23,29 @@ def register_huggingface_model(
23
23
  if revision:
24
24
  object_spec_args["revision"] = revision
25
25
 
26
+ # Auto-infer model properties from the tokenizer.
27
+ with HuggingFaceTokenizer.create_tokenizer(**object_spec_args) as tokenizer:
28
+ max_sequence_length = tokenizer.model_max_length
29
+ end_of_text_token = tokenizer.eos_token or ""
30
+ prefix_token = tokenizer.bos_token or ""
31
+ # If the tokenizer config has a model_max_length of 1000000000000000019884624838656
32
+ # it means that model creator did not specify model_max_length.
33
+ if max_sequence_length > 1_000_000:
34
+ raise ValueError(
35
+ f"Could not infer the model_max_length of Hugging Face model {pretrained_model_name_or_path}, so "
36
+ f"--enable-huggingface-models and --enable-local-huggingface-models cannot be used for this model. "
37
+ f"Please configure the model using prod_env/model_deployments.yaml instead."
38
+ )
39
+
26
40
  model_deployment = ModelDeployment(
27
41
  name=helm_model_name,
28
42
  client_spec=ClientSpec(
29
- class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient",
43
+ class_name="helm.clients.huggingface_client.HuggingFaceClient",
30
44
  args=object_spec_args,
31
45
  ),
32
46
  model_name=helm_model_name,
33
47
  tokenizer_name=helm_model_name,
34
- window_service_spec=WindowServiceSpec(
35
- class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService",
36
- args=object_spec_args,
37
- ),
48
+ max_sequence_length=max_sequence_length,
38
49
  )
39
50
 
40
51
  # We check if the model is already registered because we don't want to
@@ -51,9 +62,11 @@ def register_huggingface_model(
51
62
  tokenizer_config = TokenizerConfig(
52
63
  name=helm_model_name,
53
64
  tokenizer_spec=TokenizerSpec(
54
- class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
65
+ class_name="helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
55
66
  args=object_spec_args,
56
67
  ),
68
+ end_of_text_token=end_of_text_token,
69
+ prefix_token=prefix_token,
57
70
  )
58
71
  register_tokenizer_config(tokenizer_config)
59
72