crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,376 @@
1
+ from typing import List, Optional
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import (
4
+ ADAPT_GENERATION,
5
+ ADAPT_LANGUAGE_MODELING,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
8
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
9
+ ADAPT_RANKING_BINARY,
10
+ AdapterSpec,
11
+ )
12
+
13
+
14
def format_instructions(instructions: str) -> str:
    """Return `instructions` terminated by a newline, or unchanged when empty."""
    if len(instructions) == 0:
        # Nothing to terminate: an empty instruction block stays empty.
        return instructions
    return instructions + "\n"
18
+
19
+
20
def get_multiple_choice_joint_adapter_spec(
    instructions: str,
    input_noun: Optional[str],
    output_noun: str,
    num_outputs: int = 5,
    max_train_instances: int = 5,
    max_tokens: int = 5,
    sample_train: bool = True,
    **kwargs,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for the joint multiple-choice adaptation method.

    The spec uses temperature 0.0 and a single newline as the stop sequence.
    Any extra keyword arguments are forwarded to `AdapterSpec` unchanged.

    Prompt layout:

    [instructions]

    [input_noun]: [input]
    [reference_1]
    ...
    [reference_k]
    [output_noun]: [output]

    [input_noun]: [input]
    [reference_1]
    ...
    [reference_k]
    [output_noun]:
    """
    # Without an input noun, the input gets neither a prefix nor a suffix.
    if input_noun is None:
        input_prefix, input_suffix = "", ""
    else:
        input_prefix, input_suffix = f"{input_noun}: ", "\n"

    return AdapterSpec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions=format_instructions(instructions),
        input_prefix=input_prefix,
        input_suffix=input_suffix,
        output_prefix=f"{output_noun}: ",
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=0.0,
        stop_sequences=["\n"],
        sample_train=sample_train,
        **kwargs,
    )
61
+
62
+
63
def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
    """
    Build the `AdapterSpec` for the "separate" multiple-choice adaptation methods.

    `method` must be one of the two separate methods (original or calibrated).

    Prompt layout:

    [input] [reference_i]
    or
    [reference_i]
    """
    separate_methods = {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
    assert method in separate_methods

    # A single space separates the input from the reference, unless the input is empty.
    output_prefix = "" if empty_input else " "

    return AdapterSpec(
        method=method,
        instructions="",
        input_prefix="",
        input_suffix="",
        output_prefix=output_prefix,
        output_suffix="",
        # Separate is basically language modeling, so in-context examples are not used
        max_train_instances=0,
        num_outputs=1,
        max_tokens=0,
        temperature=0.0,
    )
84
+
85
+
86
def get_multiple_choice_adapter_spec(
    method: str,
    instructions: str,
    input_noun: Optional[str],
    output_noun: str,
    max_train_instances: int = 5,
    num_outputs: int = 5,
    max_tokens: int = 1,
    empty_input: bool = False,
    sample_train: bool = True,
    **kwargs,
):
    """
    Dispatch to the joint or the separate multiple-choice adapter spec builder.

    Only `method` and `empty_input` are used by the separate variants; every other
    argument (including `**kwargs`) applies to the joint variant only.

    Raises `ValueError` when `method` is not a multiple-choice adaptation method.
    """
    if method == ADAPT_MULTIPLE_CHOICE_JOINT:
        return get_multiple_choice_joint_adapter_spec(
            instructions,
            input_noun,
            output_noun,
            max_train_instances=max_train_instances,
            num_outputs=num_outputs,
            max_tokens=max_tokens,
            sample_train=sample_train,
            **kwargs,
        )

    if method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
        return get_multiple_choice_separate_adapter_spec(method, empty_input)

    raise ValueError(f"Invalid adaptation method: {method}")
116
+
117
+
118
def get_ranking_binary_adapter_spec(
    instructions: str = "",
    document_noun: str = "Passage",
    query_noun: str = "Query",
    output_prefix: str = "Does the passage answer the query?",
    output_noun: str = "Answer",
    max_train_instances: int = 4,
    num_outputs: int = 1,
    num_train_trials: int = 1,
    temperature: float = 0.0,
    max_tokens: int = 5,
    **kwargs,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for the binary ranking adaptation method.

    `max_train_instances` is the number of in-context examples and must be even;
    the spec itself is created with half that value. Extra keyword arguments are
    forwarded to `AdapterSpec` unchanged.

    Raises `AssertionError` when `max_train_instances` is odd.

    Prompt layout:

    [instructions]

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]: [output]

    ...

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]: [output]

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]:
    """
    # Adjacent string literals are concatenated verbatim, so the separating space
    # must be written explicitly at the end of the first fragment.
    msg = (
        "There must be an even number of in-context examples to ensure that "
        "an equal number of positive and negative examples are included."
    )
    assert max_train_instances % 2 == 0, msg
    # NOTE(review): presumably each training instance contributes one positive and one
    # negative example, hence the halving -- confirm against the ranking adapter.
    max_train_instances = max_train_instances // 2

    return AdapterSpec(
        method=ADAPT_RANKING_BINARY,
        instructions=format_instructions(instructions),
        input_prefix=f"{query_noun}: ",
        input_suffix="\n",
        reference_prefix=f"{document_noun}: ",
        reference_suffix="\n",
        output_prefix=f"{output_prefix}\n{output_noun}: ",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        num_train_trials=num_train_trials,
        temperature=temperature,
        max_tokens=max_tokens,
        **kwargs,
    )
173
+
174
+
175
def get_completion_adapter_spec(
    instructions: str = "",
    input_prefix: str = "",
    output_prefix: str = "",
    output_suffix: str = "",
    max_train_instances: int = 0,
    temperature: float = 0.0,
    num_outputs: int = 1,
    max_tokens: int = 100,
    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
    **kwargs,
) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` for plain completion-style prompts.

    The input suffix is always empty. When `stop_sequences` is omitted, no stop
    sequence is used. Extra keyword arguments are forwarded to `AdapterSpec`.

    Prompt layout:

    [input][output_prefix][output][output_suffix]

    [input][output_prefix]
    """
    # A fresh list per call avoids sharing a mutable default between specs.
    effective_stop_sequences = [] if stop_sequences is None else stop_sequences

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=format_instructions(instructions),
        input_prefix=input_prefix,
        input_suffix="",
        output_prefix=output_prefix,
        output_suffix=output_suffix,
        max_train_instances=max_train_instances,
        temperature=temperature,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        stop_sequences=effective_stop_sequences,
        **kwargs,
    )
209
+
210
+
211
def get_generation_adapter_spec(
    instructions: str = "",
    input_noun: Optional[str] = None,
    newline_after_input_noun: bool = False,
    output_noun: Optional[str] = None,
    newline_after_output_noun: bool = False,
    max_train_instances: int = 5,
    num_outputs: int = 1,
    max_tokens: int = 5,
    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
    temperature: float = 0.0,
    multi_label: bool = False,
) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` with optional nouns labelling input and output.

    When `stop_sequences` is omitted, generation stops at the first newline.

    Prompt layout:

    [instructions]

    [input_noun]: [input]
    [output_noun]: [output]

    [input_noun]: [input]
    [output_noun]:
    """

    def noun_prefix(noun: Optional[str], on_own_line: bool) -> str:
        """
        Render the label that precedes an input or an output.

        With `on_own_line` False:
            [noun]: [content]

        With `on_own_line` True:
            [noun]:
            [content]

        A missing noun yields an empty prefix.
        """
        if noun is None:
            return ""
        separator = "\n" if on_own_line else " "
        return f"{noun}:" + separator

    effective_stop_sequences = ["\n"] if stop_sequences is None else stop_sequences
    input_prefix = noun_prefix(input_noun, newline_after_input_noun)
    output_prefix = noun_prefix(output_noun, newline_after_output_noun)

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=format_instructions(instructions),
        input_prefix=input_prefix,
        input_suffix="\n",
        output_prefix=output_prefix,
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=temperature,
        stop_sequences=effective_stop_sequences,
        multi_label=multi_label,
    )
265
+
266
+
267
def get_instruct_adapter_spec(
    num_outputs: int = 1,
    max_tokens: int = 512,
    temperature: float = 0.7,
) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` for zero-shot instruction following.

    The prompt is the raw input followed by a newline: no instructions, no
    prefixes, no training instances and no stop sequences.
    """
    return AdapterSpec(
        method=ADAPT_GENERATION,
        # Prompt formatting
        instructions="",
        input_prefix="",
        input_suffix="\n",
        output_prefix="",
        output_suffix="",
        # Zero-shot: never include in-context examples
        max_train_instances=0,
        # Decoding parameters
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=temperature,
        stop_sequences=[],
    )
288
+
289
+
290
def get_few_shot_instruct_adapter_spec(
    num_outputs: int = 1,
    max_tokens: int = 512,
    temperature: float = 0.7,
    max_train_instances: int = 0,
) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` for few-shot instruction following.

    Same prompt formatting as the zero-shot variant (raw input followed by a
    newline, no stop sequences), but the number of in-context examples is
    configurable through `max_train_instances`.
    """
    return AdapterSpec(
        method=ADAPT_GENERATION,
        # Prompt formatting
        instructions="",
        input_prefix="",
        input_suffix="\n",
        output_prefix="",
        output_suffix="",
        # Number of in-context examples chosen by the caller
        max_train_instances=max_train_instances,
        # Decoding parameters
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=temperature,
        stop_sequences=[],
    )
312
+
313
+
314
def get_language_modeling_adapter_spec() -> AdapterSpec:
    """
    Build the `AdapterSpec` used for language modeling.

    All prompt decorations are empty, no training instances are used, and
    `max_tokens` is 0.
    """
    return AdapterSpec(
        method=ADAPT_LANGUAGE_MODELING,
        # The text is passed through without any added prompt formatting
        instructions="",
        input_prefix="",
        input_suffix="",
        output_prefix="",
        output_suffix="",
        max_train_instances=0,
        num_outputs=1,
        max_tokens=0,
        temperature=0.0,
    )
330
+
331
+
332
def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` for summarization.

    `num_sents` selects the wording of the request placed after the article:
    `None` asks for a summary without a length, 1 asks for "1 sentence", and any
    other value asks for that many "sentences". Extra keyword arguments are
    forwarded to `AdapterSpec`.
    """
    if num_sents is None:
        summary_request = "Summarize the above article.\n"
    elif num_sents == 1:
        # Singular wording for the one-sentence case.
        summary_request = "Summarize the above article in 1 sentence.\n"
    else:
        summary_request = f"Summarize the above article in {num_sents} sentences.\n"

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions="",
        input_prefix="###\nArticle: ",
        input_suffix="\n\n",
        output_prefix=summary_request,
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=1,
        stop_sequences=["###"],  # Separator between few-shot instances.
        **kwargs,
    )
356
+
357
+
358
def get_machine_translation_adapter_spec(
    source_language, target_language, max_train_instances, **kwargs
) -> AdapterSpec:
    """
    Build a generation `AdapterSpec` for machine translation.

    Inputs are labelled with `source_language` and outputs with `target_language`.
    Generation uses temperature 0.0 and stops at a blank line. Extra keyword
    arguments are forwarded to `AdapterSpec`.
    """
    translation_instructions = f"Translate the following sentences from {source_language} to {target_language}."
    source_prefix = f"{source_language}: "
    target_prefix = f"{target_language}: "

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=translation_instructions,
        input_prefix=source_prefix,
        input_suffix="\n",
        output_prefix=target_prefix,
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=1,
        stop_sequences=["\n\n"],
        temperature=0.0,
        **kwargs,
    )
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import Optional, Dict, List
2
+ from typing import Optional, Dict, List, Any
3
3
 
4
4
  from helm.benchmark.scenarios.scenario import Instance
5
5
  from helm.common.general import indent_lines, format_text_lines, serialize
@@ -45,6 +45,11 @@ class RequestState:
45
45
  num_conditioning_tokens: int = 0
46
46
  """The number of initial tokens that will be ignored when computing language modeling metrics"""
47
47
 
48
+ annotations: Optional[Dict[str, Any]] = None
49
+ """Output of some post-processing step that is needed for the metric to understand the request
50
+ Should match the annotator's name to an Annotation (usually a list of dictionaries for each completion)
51
+ Example: parsing, rendering an image based on the text completion, etc."""
52
+
48
53
  def __post_init__(self):
49
54
  if self.request_mode:
50
55
  assert self.request_mode in ["original", "calibration"], f"Invalid request_mode: {self.request_mode}"
@@ -3,8 +3,9 @@ from dataclasses import dataclass
3
3
  from typing import List, Dict, Tuple, Optional
4
4
 
5
5
  from helm.benchmark.scenarios.scenario import Instance
6
- from .adapter_spec import AdapterSpec
7
- from .request_state import RequestState
6
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
7
+ from helm.benchmark.adaptation.request_state import RequestState
8
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
8
9
 
9
10
 
10
11
  @dataclass
@@ -21,6 +22,9 @@ class ScenarioState:
21
22
  # List of `RequestState`s that were produced by adaptation (and execution)
22
23
  request_states: List[RequestState]
23
24
 
25
+ # Annotations to use for this run spec
26
+ annotator_specs: Optional[List[AnnotatorSpec]] = None
27
+
24
28
  def __post_init__(self):
25
29
  # Create derived indices based on `request_states` so it's easier for
26
30
  # the `Metric` later to access them. Two things are produced:
@@ -0,0 +1,43 @@
1
+ from typing import Dict, List, Any
2
+ from abc import abstractmethod, ABC
3
+ from dataclasses import dataclass
4
+
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.common.object_spec import ObjectSpec
7
+
8
+
9
class Annotator(ABC):
    """Abstract base class for objects that annotate a request state.

    An annotator produces additional information about a request state that a
    metric needs in order to understand the request, e.g. parsing the completion
    or rendering an image based on the text completion.
    """

    # Name of the annotator. Should be filled in by the subclass.
    name: str

    @abstractmethod
    def annotate(self, request_state: RequestState) -> Any:
        """Compute the implementation-specific annotation for `request_state`
        and return it."""
22
+
23
+
24
@dataclass(frozen=True)
class AnnotatorSpec(ObjectSpec):
    """Describes which `Annotator` to create.

    Only the class name is meant to be provided by the user; the constructor
    arguments are filled in by the `AnnotatorFactory`.
    """
32
+
33
+
34
class DummyAnnotator(Annotator):
    """Placeholder annotator: annotates each completion with its upper-cased text."""

    name = "dummy"

    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
        """Return one `{"all_caps": ...}` dictionary per completion.

        Raises `ValueError` when the request state has no result yet.
        """
        result = request_state.result
        if result is None:
            raise ValueError("Annotation requires a result")
        return [{"all_caps": completion.text.upper()} for completion in result.completions]
@@ -0,0 +1,61 @@
1
+ import os
2
+ from typing import Any, Dict, Mapping, Optional
3
+
4
+ from helm.common.credentials_utils import provide_api_key
5
+ from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
6
+ from helm.common.hierarchical_logger import hlog
7
+ from helm.common.object_spec import create_object, inject_object_spec_args
8
+ from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
9
+
10
+
11
class AnnotatorFactory:
    """Creates annotators from `AnnotatorSpec`s and keeps one instance per annotator name."""

    def __init__(
        self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
    ):
        self.credentials = credentials
        self.file_storage_path = file_storage_path
        self.cache_backend_config = cache_backend_config
        hlog(f"AnnotatorFactory: file_storage_path = {file_storage_path}")
        hlog(f"AnnotatorFactory: cache_backend_config = {cache_backend_config}")

        # Already-created annotators, keyed by annotator name. Reusing instances
        # prevents duplicate creation; the original authors note this matters
        # because annotation is multi-threaded and creating a new annotator per
        # request can cause race conditions.
        self.annotators: Dict[str, Annotator] = {}

    def get_annotator(self, annotator_spec: AnnotatorSpec) -> Annotator:
        """Return the annotator for `annotator_spec`, creating and caching it on first use.

        The spec must not carry any arguments; they are injected here.
        """
        assert annotator_spec.args is None or annotator_spec.args == {}

        # The annotator name is the lower-cased class name with "annotator" removed.
        class_basename: str = annotator_spec.class_name.split(".")[-1]
        annotator_name: str = class_basename.lower().replace("annotator", "")

        # Reuse a previously created annotator when there is one.
        cached: Optional[Annotator] = self.annotators.get(annotator_name)
        if cached is not None:
            return cached

        # Otherwise, create the annotator, injecting the constructor arguments it declares.
        cache_config: CacheConfig = self.cache_backend_config.get_cache_config(annotator_name)
        constant_bindings = {"cache_config": cache_config}
        provider_bindings = {
            "api_key": lambda: provide_api_key(self.credentials, annotator_name),
            "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
        }
        annotator_spec = inject_object_spec_args(
            annotator_spec,
            constant_bindings=constant_bindings,
            provider_bindings=provider_bindings,
        )
        created = create_object(annotator_spec)

        # Remember the annotator for subsequent requests.
        self.annotators[annotator_name] = created
        return created

    def _get_file_storage_path(self, annotator_name: str) -> str:
        """Return the local file cache directory for the given annotator."""
        return os.path.join(self.file_storage_path, "output", annotator_name)
@@ -0,0 +1,88 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Tuple, Callable
3
+
4
+ from dacite import from_dict
5
+
6
+ from helm.benchmark.annotation.annotator import Annotator
7
+ from helm.benchmark.adaptation.request_state import RequestState
8
+ from helm.common.cache import Cache, CacheConfig
9
+ from helm.common.file_caches.local_file_cache import LocalPILFileCache
10
+ from helm.common.optional_dependencies import handle_module_not_found_error
11
+ from helm.common.media_object import MediaObject
12
+ from helm.proxy.retry import get_retry_decorator
13
+
14
+ try:
15
+ from PIL import Image
16
+ except ModuleNotFoundError as e:
17
+ handle_module_not_found_error(e, suggestions=["images"])
18
+
19
+
20
def retry_if_compilation_failed(result: Dict[str, Any]) -> bool:
    """Tell the retry decorator whether `result` reports an unknown error and should be retried."""
    retry_marker = "unknown_error"
    return retry_marker in result
23
+
24
+
25
# Decorator applied to compilation attempts: up to 5 attempts, retrying whenever
# `retry_if_compilation_failed` flags the returned result.
retry: Callable = get_retry_decorator(
    "Compilation",
    max_attempts=5,
    wait_exponential_multiplier_seconds=2,
    retry_on_result=retry_if_compilation_failed,
)
28
+
29
+
30
class CompilationError(Exception):
    """Raised when a completion cannot be compiled into an image."""
32
+
33
+
34
class ImageCompilerAnnotator(Annotator, ABC):
    """Base class for annotators that compile each text completion into an image.

    Subclasses implement `compile_completion_into_image`; this class handles
    caching, storing the image on disk and retrying.
    """

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        self._cache = Cache(cache_config)
        self._file_cache = LocalPILFileCache(file_storage_path)

    @abstractmethod
    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Compile `completion_text` and return the image together with extra infos."""
        raise NotImplementedError

    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
        """Hook for subclasses to adjust the infos; returns them unchanged by default."""
        return infos

    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
        """Compile every completion of the request and return one annotation per completion.

        Each annotation holds either the compiled image under "media_object" plus
        the postprocessed infos, or an "error" / "unknown_error" message.
        """
        assert request_state.result is not None, "Annotator can only be used after the request has been processed."

        per_completion: List[Dict[str, Any]] = []
        for completion in request_state.result.completions:
            completion_text: str = completion.text.strip()

            @retry
            def compile_with_cache() -> Dict[str, Any]:
                def compile_uncached() -> Dict[str, Any]:
                    # Runs only on a cache miss. A CompilationError is turned into a
                    # regular result under "error"; it is not flagged for retry.
                    try:
                        assert self._file_cache is not None
                        image, infos = self.compile_completion_into_image(request_state, completion_text)
                        infos = self.postprocess_infos(infos)
                        stored_path: str = self._file_cache.store_image(lambda: image)
                        return {
                            "media_object": MediaObject(location=stored_path, content_type="image/png").to_dict(),
                            **infos,
                        }
                    except CompilationError as e:
                        return {"error": str(e)}

                # Any other failure is reported under "unknown_error", which the
                # retry decorator treats as a reason to try again.
                try:
                    # The cache key is the stripped completion text only.
                    cached_response, _ = self._cache.get({"completion": completion_text}, compile_uncached)
                    return cached_response
                except Exception as e:
                    return {"unknown_error": str(e)}

            # Copy the response before replacing the serialized media object.
            annotation = dict(compile_with_cache())
            if "media_object" in annotation:
                annotation["media_object"] = from_dict(MediaObject, annotation["media_object"])

            per_completion.append(annotation)
        return per_completion
@@ -0,0 +1,59 @@
1
+ from typing import List, Tuple, Dict, Any
2
+
3
+ from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.common.optional_dependencies import handle_module_not_found_error
6
+ from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
7
+ latex_to_image,
8
+ strip_unnecessary_latex_parts,
9
+ )
10
+
11
+ try:
12
+ from PIL import Image
13
+ except ModuleNotFoundError as e:
14
+ handle_module_not_found_error(e, suggestions=["images"])
15
+
16
+
17
class LatexCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles LaTeX text completions into images."""

    name: str = "latex_compiler"

    # (opening, closing) markers of a code block, tried in order
    DELIMITERS: List[Tuple[str, str]] = [
        ("```latex", "```"),
        ("```", "```"),
    ]

    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
        """Add a "text" entry holding the LaTeX code with unnecessary parts stripped."""
        processed = super().postprocess_infos(infos)
        assert "latex_code" in processed, "The latex_code field should be present in the infos"
        processed["text"] = strip_unnecessary_latex_parts(processed["latex_code"])
        return processed

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Extract the LaTeX code from the completion and compile it into an image.

        Raises `CompilationError` when the LaTeX compilation raises a `RuntimeError`.
        """
        # No assets are provided to the compiler
        assets_path: str = ""

        # If the completion contains a delimited code block, keep only the content
        # of the first delimiter pair that matches.
        for opening, closing in self.DELIMITERS:
            opening_at = completion_text.find(opening)
            if opening_at == -1:
                continue
            body_start = opening_at + len(opening)
            body_end = completion_text.find(closing, body_start)
            if body_end == -1:
                continue
            completion_text = completion_text[body_start:body_end]
            break

        # Convert the latex code to an image
        try:
            image, infos = latex_to_image(completion_text, assets_path, crop=True)
        except RuntimeError as e:
            # OptionalDependencyNotInstalled (error with the latex installation) is
            # deliberately not caught: it is fatal and should be handled by the user
            raise CompilationError(str(e)) from e

        return image, infos