crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,84 @@
1
+ from typing import Tuple, Dict, Any
2
+ import os
3
+ import subprocess
4
+ import tempfile
5
+
6
+ from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
7
+ from helm.benchmark.adaptation.request_state import RequestState
8
+ from helm.common.cache import CacheConfig
9
+ from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
10
+
11
+ try:
12
+ from PIL import Image, ImageOps
13
+ except ModuleNotFoundError as ex:
14
+ handle_module_not_found_error(ex, suggestions=["images"])
15
+
16
+
17
class LilypondCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles the text completions into a music sheet with LilyPond.

    The LilyPond tools (`lilypond`, `convert-ly`) are taken from `base_path` when they exist
    there; otherwise they are looked up on the system `PATH`.
    """

    name: str = "lilypond_compiler"
    # Preferred directory for the LilyPond binaries. Kept as-is for backward compatibility;
    # on machines where this directory does not exist, the binaries found on PATH are used.
    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        """Initialize the annotator and check that LilyPond can be executed.

        Raises:
            OptionalDependencyNotInstalled: if `lilypond --version` cannot be started
                or exits with a non-zero return code.
        """
        super().__init__(cache_config, file_storage_path)
        try:
            result = subprocess.run(
                [self._resolve_executable("lilypond"), "--version"], capture_output=True, text=True
            )
        except FileNotFoundError as e:
            raise OptionalDependencyNotInstalled(
                "LilyPond is not installed. Download and install it from https://lilypond.org/download.html.\n"
                f"Original error: {e}"
            ) from e
        if result.returncode != 0:
            raise OptionalDependencyNotInstalled(
                "LilyPond is not installed. Download and install it from https://lilypond.org/download.html"
            )

    def _resolve_executable(self, executable_name: str) -> str:
        """Return the path of a LilyPond tool, preferring `base_path` over `PATH`.

        When the tool is found in neither place, the `base_path` candidate is returned so
        that running it fails with `FileNotFoundError`, exactly as before.
        """
        import shutil  # Local import: only needed for the PATH fallback

        candidate: str = os.path.join(self.base_path, executable_name)
        if os.path.isfile(candidate):
            return candidate
        return shutil.which(executable_name) or candidate

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Given a completion, compile it with LilyPond.

        Returns:
            The cropped image of the music sheet and an empty info dictionary.

        Raises:
            CompilationError: if `convert-ly` fails or LilyPond does not produce an image.
        """
        # The LilyPond command requires a file on disk. Everything is written to a private
        # scratch directory so that the input, the image and any by-product the tools write
        # next to them are all removed when we are done, even on failure.
        with tempfile.TemporaryDirectory() as work_dir:
            # What we pass in as -o is the .ly path without the extension;
            # LilyPond then writes the music sheet to the same path with a .png extension.
            output_path: str = os.path.join(work_dir, "score")
            ly_file_path: str = f"{output_path}.ly"
            sheet_music_path: str = f"{output_path}.png"
            with open(ly_file_path, "w") as f:
                f.write(completion_text)

            try:
                # Edits the LilyPond file to be compatible with the current version
                result = subprocess.run(
                    [self._resolve_executable("convert-ly"), "-e", ly_file_path], capture_output=True, text=True
                )
                if result.returncode != 0:
                    raise CompilationError(f"convert-ly failed: {result.stderr}")

                # Generate PNG image from the LilyPond file
                # LilyPond supports partial compilation, which means it attempts to produce an image
                # for the correct portions of the code, even if there are errors elsewhere
                subprocess.run(
                    [self._resolve_executable("lilypond"), "--png", "-o", output_path, ly_file_path],
                    capture_output=True,
                    text=True,
                )
                # If an image file is not generated, we consider it an absolute compilation failure
                if not os.path.exists(sheet_music_path):
                    raise CompilationError("lilypond did not generate the image")

                # Load the image and crop it while the file still exists; `crop` returns a new
                # image that no longer depends on the file.
                with Image.open(sheet_music_path) as sheet:
                    (w, h) = sheet.size
                    image = sheet.crop((0, 0, w, h - int(h * 0.2)))  # Remove pagination
                image = image.crop(ImageOps.invert(image).getbbox())  # Remove white border
            except (AssertionError, RuntimeError) as e:
                raise CompilationError(str(e)) from e

        return image, dict()
@@ -0,0 +1,132 @@
1
+ from typing import List, Tuple, Optional, Dict, Any
2
+ import json
3
+ import os
4
+ import shutil
5
+ import threading
6
+
7
+ from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
8
+ from helm.benchmark.adaptation.request_state import RequestState
9
+ from helm.common.optional_dependencies import handle_module_not_found_error
10
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import ScreenshotOptions
11
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
12
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario import serve_and_take_screenshot
13
+ from helm.benchmark.scenarios.scenario import ASSET_NAME_TAG, ASSET_PATH_TAG
14
+ from helm.common.general import ensure_directory_exists
15
+ from helm.common.cache import CacheConfig
16
+
17
+ try:
18
+ from PIL import Image
19
+ from html2text import HTML2Text
20
+ except ModuleNotFoundError as e:
21
+ handle_module_not_found_error(e, suggestions=["image2structure"])
22
+
23
+
24
class WebpageCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles the text completions into a webpage
    And takes a screenshot of the webpage."""

    name: str = "webpage_compiler"

    # Delimiters for the code block
    DELIMITERS: List[Tuple[str, str]] = [
        ("```json", "```"),
        ("```", "```"),
    ]

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        super().__init__(cache_config, file_storage_path)
        self._html2text = HTML2Text()
        self._html2text.ignore_links = True

    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
        """Postprocess the infos and add a `text` entry converted from the `html` entry."""
        annotations = super().postprocess_infos(infos)
        assert "html" in annotations, "The html field should be present in the infos"
        annotations["text"] = convert_html_to_text(self._html2text, infos["html"])
        return annotations

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Given a completion, parse the code and compile it into an image and return the image and the infos.

        The completion must be a JSON list of `{"filename": ..., "content": ...}` objects,
        optionally wrapped in a code block.

        Raises:
            CompilationError: if the completion is not valid JSON, is not a list of files,
                or contains a file entry that is malformed or points outside the webpage directory.
        """
        # Create a temporary directory to store the files
        cache_config: CacheConfig = self._cache.config
        repo_path: str = "prod_env/tmp"
        if hasattr(cache_config, "path"):
            repo_path = os.path.join(os.path.dirname(cache_config.path), "tmp")
        # Make the repo path thread safe by adding the thread id
        repo_path = f"{repo_path}_{threading.get_ident()}"
        ensure_directory_exists(repo_path)

        try:
            structure: Any = self._parse_structure(completion_text)
            assets_names: List[str] = self._copy_assets(request_state, repo_path)
            self._write_files(structure, assets_names, repo_path)

            # Save the screenshot and load the image
            destination_path: str = os.path.join(repo_path, "output.png")
            infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, ScreenshotOptions())
            image: Image.Image = Image.open(destination_path)
            # `Image.open` is lazy: read the pixels now, before the file is deleted below
            image.load()
        finally:
            # Delete the repository, also on failure: the path is reused by later calls on the
            # same thread, so leftover files would otherwise end up in the next webpage.
            shutil.rmtree(repo_path, ignore_errors=True)

        return image, infos

    def _parse_structure(self, completion_text: str) -> Any:
        """Strip an optional code block around the completion and parse it as JSON."""
        # Check for code block delimiters
        # After this completion should be a valid json object
        for start, end in self.DELIMITERS:
            if start in completion_text and end in completion_text[completion_text.index(start) + len(start) :]:
                start_index = completion_text.index(start) + len(start)
                end_index = completion_text.index(end, start_index)
                completion_text = completion_text[start_index:end_index]
                break

        try:
            return json.loads(completion_text)
        except json.JSONDecodeError as e:
            raise CompilationError(f"Failed to parse the completion as a JSON object: {e}") from e

    def _copy_assets(self, request_state: RequestState, repo_path: str) -> List[str]:
        """Copy the assets referenced by the instance into `repo_path` and return their names."""
        assets_paths: List[str] = []
        assets_names: List[str] = []
        for reference in request_state.instance.references:
            if ASSET_PATH_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_local_file
                    assert media_object.is_type("image")
                    assert isinstance(media_object.location, str)
                    assets_paths.append(media_object.location)
            if ASSET_NAME_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_type("text")
                    assert isinstance(media_object.text, str)
                    assets_names.append(media_object.text)
        assert len(assets_paths) == len(assets_names)

        for asset_path, asset_name in zip(assets_paths, assets_names):
            dest_path: str = os.path.join(repo_path, asset_name)
            # Make sure the parent directory exists
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.copyfile(asset_path, dest_path)
        return assets_names

    def _write_files(self, structure: Any, assets_names: List[str], repo_path: str) -> None:
        """Write every file described by the parsed completion below `repo_path`."""
        if not isinstance(structure, list):
            raise CompilationError("The completion should be a list of files")

        repo_root: str = os.path.realpath(repo_path)
        for item in structure:
            if not isinstance(item, dict):
                raise CompilationError("Each file should have a valid filename and content")
            filename: Optional[str] = item.get("filename")
            content: Optional[str] = item.get("content")
            if filename is None or content is None:
                raise CompilationError("Each file should have a valid filename and content")
            if filename in assets_names:
                # Some models will include assets in their response like this:
                # {
                #     "filename": "chmber.jpg",
                #     "content": "The content of the chmber.jpg file is a binary image and cannot be displayed as text."
                # }
                # In this case, we skip the file creation
                continue
            if not isinstance(filename, str) or not isinstance(content, str):
                raise CompilationError("Each file should have a valid filename and content")

            # The filename comes from the model: refuse absolute paths and `..` components
            # that would make us write outside of the webpage directory.
            file_path: str = os.path.realpath(os.path.join(repo_path, filename))
            if not file_path.startswith(repo_root + os.sep):
                raise CompilationError(f"The filename should be inside the webpage directory: {filename}")

            # Create parent directories if they do not exist
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "w") as f:
                f.write(content)
@@ -0,0 +1,26 @@
1
+ from typing import Any, Dict
2
+ import os
3
+ import shutil
4
+
5
+ from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
6
+ from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
7
+ from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
8
+
9
+
10
class TestAnnotatorFactory:
    """Checks that `AnnotatorFactory` builds an annotator from an `AnnotatorSpec`."""

    def setup_method(self):
        """Create a factory with empty credentials and a `BlackHoleCacheBackendConfig`."""
        self.file_storage_path: str = "tmp"
        empty_credentials: Dict[str, Any] = {}
        self.annotator_factory = AnnotatorFactory(
            empty_credentials, self.file_storage_path, BlackHoleCacheBackendConfig()
        )

    def teardown_method(self):
        """Remove the file storage directory if the test created it."""
        storage_dir = self.file_storage_path
        if not os.path.exists(storage_dir):
            return
        shutil.rmtree(storage_dir)

    def test_get_annotator(self):
        """The dummy annotator can be created from its fully qualified class name."""
        spec = AnnotatorSpec(class_name="helm.benchmark.annotation.annotator.DummyAnnotator")
        annotator = self.annotator_factory.get_annotator(spec)
        assert isinstance(annotator, Annotator)
        assert annotator.name == "dummy"
@@ -0,0 +1,44 @@
1
+ from typing import Any
2
+ from dataclasses import replace
3
+ import pytest
4
+
5
+ from helm.benchmark.annotation.annotator import Annotator, DummyAnnotator
6
+ from helm.benchmark.adaptation.request_state import RequestState
7
+ from helm.benchmark.scenarios.scenario import Instance, Input
8
+ from helm.common.request import Request, RequestResult, GeneratedOutput
9
+
10
+
11
class TestDummyAnnotator:
    """Exercises `DummyAnnotator.annotate` with and without a request result."""

    def setup_method(self):
        """Create the annotator and a request state whose `result` is still `None`."""
        instance = Instance(input=Input(text="hello world"), references=[])
        self.annotator: Annotator = DummyAnnotator()
        self.request_state = RequestState(
            instance=instance,
            request=Request(),
            request_mode="original",
            output_mapping=None,
            result=None,
            train_trial_index=0,
            num_train_instances=0,
            prompt_truncated=False,
            reference_index=None,
        )

    def test_annotate(self):
        """The first annotation holds the upper-cased text of the first completion."""
        completion = GeneratedOutput(text="How are you?", logprob=0, tokens=[])
        result = RequestResult(success=True, embedding=[], completions=[completion], cached=True)
        request_state: RequestState = replace(self.request_state, result=result)

        annotations: Any = self.annotator.annotate(request_state)

        assert annotations is not None
        first_annotation = annotations[0]
        assert "all_caps" in first_annotation
        assert request_state.result is not None  # To make mypy happy
        expected: str = request_state.result.completions[0].text.upper()
        assert first_annotation["all_caps"] == expected

    def test_annotate_no_result(self):
        """Annotating a request state that has no result raises `ValueError`."""
        with pytest.raises(ValueError):
            self.annotator.annotate(self.request_state)
@@ -0,0 +1,124 @@
1
+ import os
2
+
3
+ from typing import Optional, List, Dict, Any
4
+ from dataclasses import dataclass, replace
5
+ from helm.common.cache_backend_config import (
6
+ CacheBackendConfig,
7
+ BlackHoleCacheBackendConfig,
8
+ MongoCacheBackendConfig,
9
+ SqliteCacheBackendConfig,
10
+ )
11
+
12
+ from helm.common.general import ensure_directory_exists, parallel_map, get_credentials
13
+ from helm.common.hierarchical_logger import htrack, hlog
14
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
15
+ from helm.benchmark.adaptation.request_state import RequestState
16
+ from helm.benchmark.annotation.annotator import AnnotatorSpec, Annotator
17
+ from helm.benchmark.annotation.annotator_factory import AnnotatorFactory
18
+ from helm.proxy.services.service import CACHE_DIR
19
+
20
+
21
class AnnotationExecutorError(Exception):
    """Raised when the annotation executor is misconfigured or an annotator fails on a request."""
23
+
24
+
25
@dataclass(frozen=True)
class AnnotationExecutionSpec:
    """Settings that control how `AnnotationExecutor` runs the annotators."""

    # Path where API credentials and cache is stored.
    # This path is the same as `--base-path` when launching the proxy server (see server.py).
    local_path: str

    # How many threads to have at once
    parallelism: int

    # Whether to skip execution
    dry_run: bool = False

    # If set, SQLite will be used for the cache.
    # This specifies the directory in which the SQLite cache will store files.
    # At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.
    sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None

    # If set, MongoDB will be used for the cache.
    # This specifies the MongoDB database to be used by the MongoDB cache.
    # At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set.
    mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
51
+
52
+
53
class AnnotationExecutor:
    """
    An `AnnotationExecutor` takes a `ScenarioState` and runs every annotator listed in its
    `annotator_specs` on each of its request states.
    Returns a new `ScenarioState` whose request states carry the annotations.
    """

    def __init__(self, execution_spec: AnnotationExecutionSpec):
        """Pick the cache backend and build the `AnnotatorFactory` used by `process`.

        Raises:
            AnnotationExecutorError: if both the SQLite and the MongoDB cache configs are set.
        """
        self.execution_spec = execution_spec

        sqlite_config = execution_spec.sqlite_cache_backend_config
        mongo_config = execution_spec.mongo_cache_backend_config
        if sqlite_config and mongo_config:
            raise AnnotationExecutorError(
                "At most one of sqlite_cache_backend_config and mongo_cache_backend_config can be set."
            )
        # Fall back to a `BlackHoleCacheBackendConfig` when no cache backend was configured
        cache_backend_config: CacheBackendConfig = sqlite_config or mongo_config or BlackHoleCacheBackendConfig()

        base_path: str = execution_spec.local_path
        ensure_directory_exists(base_path)
        client_file_storage_path = os.path.join(base_path, CACHE_DIR)
        ensure_directory_exists(client_file_storage_path)
        credentials: Dict[str, str] = get_credentials(base_path)
        self.factory = AnnotatorFactory(
            credentials=credentials,
            file_storage_path=client_file_storage_path,
            cache_backend_config=cache_backend_config,
        )

    @htrack(None)
    def execute(self, scenario_state: ScenarioState) -> ScenarioState:
        """Annotate all request states of `scenario_state`.

        The input is returned unchanged on a dry run or when it has no annotator specs.
        """
        if self.execution_spec.dry_run:
            hlog("Skipped annotation.")
            return scenario_state

        annotator_specs = scenario_state.annotator_specs
        if annotator_specs is None or len(annotator_specs) == 0:
            hlog("No annotators to run.")
            return scenario_state

        def annotate_request_state(request_state: RequestState) -> RequestState:
            assert scenario_state.annotator_specs is not None
            return self.process(scenario_state.annotator_specs, request_state)

        self.annotator_specs = annotator_specs

        annotated_request_states = parallel_map(
            annotate_request_state,
            scenario_state.request_states,
            parallelism=self.execution_spec.parallelism,
        )

        hlog(f"Annotated {len(annotated_request_states)} requests")
        return ScenarioState(
            adapter_spec=scenario_state.adapter_spec,
            request_states=annotated_request_states,
            annotator_specs=scenario_state.annotator_specs,
        )

    def process(self, annotator_specs: List[AnnotatorSpec], state: RequestState) -> RequestState:
        """Run each annotator on `state` and return a copy with the annotations keyed by annotator name.

        Raises:
            AnnotationExecutorError: if creating or running an annotator raises; the message
                includes the request that was being annotated.
        """
        annotations_by_name: Dict[str, Any] = {}
        try:
            for spec in annotator_specs:
                annotator: Annotator = self.factory.get_annotator(spec)
                annotations_by_name[annotator.name] = annotator.annotate(state)
        except Exception as e:
            raise AnnotationExecutorError(f"{str(e)} Request: {state.request}") from e
        return replace(state, annotations=annotations_by_name)
@@ -35,7 +35,6 @@ class Processor:
35
35
 
36
36
  @dataclass(frozen=True)
37
37
  class DataAugmenter:
38
-
39
38
  # Perturbations to apply to generate new instances
40
39
  perturbations: List[Perturbation]
41
40
 
@@ -72,7 +71,6 @@ class DataAugmenter:
72
71
 
73
72
  @dataclass(frozen=True)
74
73
  class DataAugmenterSpec:
75
-
76
74
  # List of perturbation specs to use to augment the data
77
75
  perturbation_specs: List[PerturbationSpec] = field(default_factory=list)
78
76
 
@@ -214,7 +214,7 @@ class GenderPerturbation(TextPerturbation):
214
214
  def perturb(self, text: str, rng: Random) -> str:
215
215
  """Perform the perturbations on the provided text."""
216
216
  # Substitute the words
217
- for (word, synonym) in self.word_synonym_pairs:
217
+ for word, synonym in self.word_synonym_pairs:
218
218
  text = self.substitute_word(text, word, synonym, rng)
219
219
 
220
220
  return text
@@ -10,7 +10,6 @@ from helm.common.object_spec import ObjectSpec, create_object
10
10
 
11
11
 
12
12
  class Perturbation(ABC):
13
-
14
13
  # Unique name to describe perturbation
15
14
  name: str
16
15
 
@@ -56,11 +55,18 @@ class TextPerturbation(Perturbation, ABC):
56
55
  input=Input(text=self.perturb(instance.input.text, rng)),
57
56
  references=references,
58
57
  perturbation=description,
58
+ contrast_inputs=[instance.input],
59
59
  )
60
60
 
61
61
  def _perturb_reference(self, reference: Reference, rng: Random) -> Reference:
62
62
  """Generates a new Reference by perturbing the output and tagging the Reference."""
63
- return replace(reference, output=Output(text=self.perturb(reference.output.text, rng)), tags=reference.tags)
63
+ return replace(
64
+ reference,
65
+ output=Output(
66
+ text=self.perturb(reference.output.text, rng), multimedia_content=reference.output.multimedia_content
67
+ ),
68
+ tags=reference.tags,
69
+ )
64
70
 
65
71
  @abstractmethod
66
72
  def perturb(self, text: str, rng: Random) -> str:
@@ -23,7 +23,7 @@ class PerturbationDescription:
23
23
  computed_on: str = PERTURBATION_PERTURBED
24
24
  """Which types of Instances we are evaluating, to be populated during metric evaluation. PERTURBATION_PERTURBED
25
25
  (default) means we are evaluating on perturbed instances, PERTURBATION_ORIGINAL means we are evaluating the
26
- unperturbed version of instances where this perturbation appplies, and, PERTURBATION_WORST means the the minimum
26
+ unperturbed version of instances where this perturbation applies, and, PERTURBATION_WORST means the minimum
27
27
  metric between the two."""
28
28
 
29
29
  seed: Optional[int] = None
@@ -0,0 +1,29 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from .perturbation import TextPerturbation
5
+ from .perturbation_description import PerturbationDescription
6
+
7
+
8
class SuffixPerturbation(TextPerturbation):
    """
    Text perturbation that adds a suffix after the text, separated by a comma. Example:

    A picture of a dog -> A picture of a dog, picasso
    """

    @dataclass(frozen=True)
    class Description(PerturbationDescription):
        # The suffix that gets appended to the text
        suffix: str = ""

    name: str = "style"

    def __init__(self, suffix: str):
        self._suffix: str = suffix

    @property
    def description(self) -> PerturbationDescription:
        """Description of this perturbation, carrying its name and suffix."""
        fields = {"name": self.name, "suffix": self._suffix}
        return SuffixPerturbation.Description(**fields)

    def perturb(self, text: str, rng: Random) -> str:
        """Return `text` followed by ", " and the suffix; `rng` is not used."""
        return "{}, {}".format(text, self._suffix)
@@ -15,6 +15,7 @@ from .space_perturbation import SpacePerturbation
15
15
  from .dialect_perturbation import DialectPerturbation
16
16
  from .person_name_perturbation import PersonNamePerturbation
17
17
  from .gender_perturbation import GenderPerturbation
18
+ from .suffix_perturbation import SuffixPerturbation
18
19
 
19
20
 
20
21
  def test_extra_space_perturbation():
@@ -145,7 +146,6 @@ def test_space_perturbation():
145
146
  instance: Instance = Instance(id="id0", input=Input(text="Hello World!\nQuite a day, huh?"), references=[])
146
147
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
147
148
 
148
- print(instances)
149
149
  assert len(instances) == 2
150
150
  assert instances[1].perturbation.name == "space"
151
151
  assert instances[1].input.text == "Hello World!\nQuite a day, huh?"
@@ -162,7 +162,6 @@ def test_dialect_perturbation():
162
162
  )
163
163
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
164
164
 
165
- print(instances)
166
165
  assert len(instances) == 2
167
166
  assert instances[1].perturbation.name == "dialect"
168
167
  assert instances[1].input.text == "I gon remember dis day to b the best day of mah life."
@@ -188,7 +187,6 @@ def test_person_name_perturbation():
188
187
  )
189
188
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
190
189
 
191
- print(instances)
192
190
  assert len(instances) == 2
193
191
  assert instances[1].perturbation.name == "person_name"
194
192
  assert (
@@ -209,7 +207,6 @@ def test_gender_pronoun_perturbation():
209
207
  )
210
208
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
211
209
 
212
- print(instances)
213
210
  assert len(instances) == 2
214
211
  assert instances[1].perturbation.mode == "pronouns"
215
212
  assert instances[1].input.text == "Did she mention that she was coming with her parents and their friends?"
@@ -227,13 +224,22 @@ def test_gender_term_perturbation():
227
224
  )
228
225
  instances: List[Instance] = data_augmenter.generate([instance], include_original=True)
229
226
 
230
- print(instances)
231
227
  assert len(instances) == 2
232
228
  assert instances[1].perturbation.mode == "terms"
233
229
  assert instances[1].input.text == "His granddaughters looked a lot like their mom."
234
230
  assert instances[1].references[0].output.text == "How did their mother look like?"
235
231
 
236
232
 
233
def test_suffix_perturbation():
    """The suffix perturbation appends ", <suffix>" to the input and records the suffix."""
    original: Instance = Instance(id="id0", input=Input(text="A blue dog"), references=[])
    augmenter = DataAugmenter(perturbations=[SuffixPerturbation(suffix="pixel art")])
    instances: List[Instance] = augmenter.generate([original], include_original=True)

    assert len(instances) == 2
    perturbed: Instance = instances[1]
    assert perturbed.perturbation.suffix == "pixel art"
    assert perturbed.input.text == "A blue dog, pixel art"
241
+
242
+
237
243
  # TODO(#1958) Fix the logic to re-enable this test
238
244
  @unittest.skip("Currently cannot replace words at either text boundary.")
239
245
  def test_gender_term_perturbation_edge_word():
@@ -247,7 +253,6 @@ def test_gender_term_perturbation_edge_word():
247
253
  )
248
254
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
249
255
 
250
- print(instances)
251
256
  assert len(instances) == 1
252
257
  assert instances[0].input.text == "mom said it is okay"
253
258
  assert instances[0].references[0].output.text == "Sure he did daughter"
@@ -266,6 +271,5 @@ def test_gender_term_perturbation_consequtive_words():
266
271
  )
267
272
  instances: List[Instance] = data_augmenter.generate([instance], include_original=False)
268
273
 
269
- print(instances)
270
274
  assert len(instances) == 1
271
275
  assert instances[0].input.text == "I'm a mom mom: my daughter has a daughter."
@@ -0,0 +1,30 @@
1
+ from dataclasses import dataclass
2
+ from random import Random
3
+
4
+ from helm.clients.google_translate_client import GoogleTranslateClient
5
+ from .perturbation import TextPerturbation
6
+ from .perturbation_description import PerturbationDescription
7
+
8
+
9
class TranslatePerturbation(TextPerturbation):
    """
    Text perturbation that translates the text into the language given by `language_code`,
    delegating the translation to `GoogleTranslateClient`.
    """

    @dataclass(frozen=True)
    class Description(PerturbationDescription):
        # Language code to translate to. Needs a default value since we inherit from `PerturbationDescription`
        language_code: str = "zh-CN"

    name: str = "translate"

    def __init__(self, language_code: str):
        self.google_translate_client = GoogleTranslateClient()
        self.language_code: str = language_code

    @property
    def description(self) -> PerturbationDescription:
        """Description of this perturbation, carrying its name and target language code."""
        fields = {"name": self.name, "language_code": self.language_code}
        return TranslatePerturbation.Description(**fields)

    def perturb(self, text: str, rng: Random) -> str:
        """Return `text` translated to `language_code`; `rng` is not used."""
        translated: str = self.google_translate_client.translate(text, self.language_code)
        return translated
@@ -4,16 +4,18 @@ import importlib_resources as resources
4
4
  from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
5
5
  from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
6
6
  from helm.benchmark.tokenizer_config_registry import register_tokenizer_configs_from_path
7
+ from helm.benchmark.runner_config_registry import register_runner_config_from_path
7
8
 
8
9
 
9
10
  MODEL_METADATA_FILE: str = "model_metadata.yaml"
10
11
  TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
11
12
  MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
13
+ RUNNER_CONFIG_FILE: str = "runner_config.yaml"
12
14
 
13
15
  CONFIG_PACKAGE = "helm.config"
14
16
 
15
17
 
16
- def register_configs_from_directory(dir_path) -> None:
18
+ def register_configs_from_directory(dir_path: str) -> None:
17
19
  model_metadata_path = os.path.join(dir_path, MODEL_METADATA_FILE)
18
20
  if os.path.isfile(model_metadata_path):
19
21
  register_model_metadata_from_path(model_metadata_path)
@@ -26,6 +28,10 @@ def register_configs_from_directory(dir_path) -> None:
26
28
  if os.path.isfile(model_deployments_path):
27
29
  register_model_deployments_from_path(model_deployments_path)
28
30
 
31
+ runner_config_path = os.path.join(dir_path, RUNNER_CONFIG_FILE)
32
+ if os.path.isfile(runner_config_path):
33
+ register_runner_config_from_path(runner_config_path)
34
+
29
35
 
30
36
  def register_builtin_configs_from_helm_package() -> None:
31
37
  package_path = str(resources.files(CONFIG_PACKAGE))