crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the original diff page for more details.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -1,10 +1,23 @@
1
1
  # mypy: check_untyped_defs = False
2
+ from typing import List, Set
2
3
  from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
3
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
5
  from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
5
6
  from .test_adapter import TestAdapter
6
7
 
7
8
 
9
def _make_instance(
    text: str, reference_texts: List[str], correct_references: Set[int], is_eval: bool = False
) -> Instance:
    """Build an `Instance` for the tests in this module.

    One `Reference` is created per entry of `reference_texts`; a reference is tagged with
    `CORRECT_TAG` when its position is in `correct_references`. The instance is placed in
    `TEST_SPLIT` when `is_eval` is true and in `TRAIN_SPLIT` otherwise.
    """
    references = [
        Reference(Output(text=reference_text), tags=[CORRECT_TAG] if index in correct_references else [])
        for index, reference_text in enumerate(reference_texts)
    ]
    return Instance(Input(text=text), references=references, split=TEST_SPLIT if is_eval else TRAIN_SPLIT)
19
+
20
+
8
21
  class TestMultipleChoiceJointAdapter(TestAdapter):
9
22
  def test_sample_examples(self):
10
23
  adapter_spec = AdapterSpec(
@@ -53,6 +66,47 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
53
66
  examples = adapter.sample_examples(all_train_instances, seed=0)
54
67
  assert len(examples) == 3
55
68
 
69
+ def test_sample_examples_unique_labels(self):
70
+ """This is a demonstration of behavior reported in issue #2224."""
71
+ adapter_spec = AdapterSpec(
72
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
73
+ )
74
+ adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
75
+ all_train_instances = [
76
+ # Three with 0 being correct.
77
+ _make_instance("one", ["0", "1"], correct_references={0}),
78
+ _make_instance("two", ["2", "3"], correct_references={0}),
79
+ _make_instance("three", ["4", "5"], correct_references={0}),
80
+ # Two with 1 being correct.
81
+ _make_instance("four", ["6", "7"], correct_references={1}),
82
+ _make_instance("five", ["8", "9"], correct_references={1}),
83
+ ]
84
+ eval_instance = _make_instance("eval", ["10", "11"], correct_references={1}, is_eval=True)
85
+ request_states = adapter.adapt(all_train_instances + [eval_instance], parallelism=1)
86
+ assert len(request_states) == 1
87
+ # In every case, we are showing that model that Output should be "A".
88
+ assert request_states[0].request.prompt == (
89
+ "Input: three\n"
90
+ "A. 4\n"
91
+ "B. 5\n"
92
+ "Output: A\n"
93
+ "\n"
94
+ "Input: two\n"
95
+ "A. 2\n"
96
+ "B. 3\n"
97
+ "Output: A\n"
98
+ "\n"
99
+ "Input: one\n"
100
+ "A. 0\n"
101
+ "B. 1\n"
102
+ "Output: A\n"
103
+ "\n"
104
+ "Input: eval\n"
105
+ "A. 10\n"
106
+ "B. 11\n"
107
+ "Output:"
108
+ )
109
+
56
110
  def test_multiple_correct_reference(self):
57
111
  adapter_spec = AdapterSpec(
58
112
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -91,9 +145,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
91
145
  ],
92
146
  split=TEST_SPLIT,
93
147
  )
94
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
95
- assert len(actual_instances) == 1
96
- assert actual_instances[0].request.prompt == (
148
+ request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
149
+ assert len(request_states) == 1
150
+ assert request_states[0].request.prompt == (
97
151
  "Input: Second reference is correct\n"
98
152
  "A. First\n"
99
153
  "B. Second\n"
@@ -150,9 +204,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
150
204
  ],
151
205
  split=TEST_SPLIT,
152
206
  )
153
- actual_instances = adapter.adapt(train_instances + [eval_instance], parallelism=1).request_states
154
- assert len(actual_instances) == 1
155
- assert actual_instances[0].request.prompt == (
207
+ request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
208
+ assert len(request_states) == 1
209
+ assert request_states[0].request.prompt == (
156
210
  "Input: Second reference is correct\n"
157
211
  "A. First\n"
158
212
  "B. Second\n"
@@ -0,0 +1,376 @@
1
+ from typing import List, Optional
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import (
4
+ ADAPT_GENERATION,
5
+ ADAPT_LANGUAGE_MODELING,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
8
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
9
+ ADAPT_RANKING_BINARY,
10
+ AdapterSpec,
11
+ )
12
+
13
+
14
def format_instructions(instructions: str) -> str:
    """Return `instructions` followed by a newline, or the empty string unchanged."""
    return f"{instructions}\n" if instructions else instructions
18
+
19
+
20
def get_multiple_choice_joint_adapter_spec(
    instructions: str,
    input_noun: Optional[str],
    output_noun: str,
    num_outputs: int = 5,
    max_train_instances: int = 5,
    max_tokens: int = 5,
    sample_train: bool = True,
    **kwargs,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for joint multiple choice, where all references are listed
    in the prompt. Prompt layout:

    [instructions]

    [input_noun]: [input]
    [reference_1]
    ...
    [reference_k]
    [output_noun]: [output]

    [input_noun]: [input]
    [reference_1]
    ...
    [reference_k]
    [output_noun]:

    When `input_noun` is None, both the input prefix and the input suffix are empty.
    Extra keyword arguments are forwarded to `AdapterSpec` unchanged.
    """
    if input_noun is None:
        input_prefix, input_suffix = "", ""
    else:
        input_prefix, input_suffix = f"{input_noun}: ", "\n"

    return AdapterSpec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT,
        instructions=format_instructions(instructions),
        input_prefix=input_prefix,
        input_suffix=input_suffix,
        output_prefix=f"{output_noun}: ",
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=0.0,
        stop_sequences=["\n"],
        sample_train=sample_train,
        **kwargs,
    )
61
+
62
+
63
def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
    """
    Build the `AdapterSpec` for separate multiple choice. Prompt layout:

    [input] [reference_i]
    or
    [reference_i]

    `method` must be one of the two "separate" adaptation methods (original or calibrated).
    When `empty_input` is true the output prefix is empty instead of a single space.
    """
    separate_methods = {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
    assert method in separate_methods

    output_prefix = "" if empty_input else " "
    return AdapterSpec(
        method=method,
        instructions="",
        input_prefix="",
        input_suffix="",
        output_prefix=output_prefix,
        output_suffix="",
        # Separate is basically language modeling, so can't easily use in-context examples
        max_train_instances=0,
        num_outputs=1,
        max_tokens=0,
        temperature=0.0,
    )
84
+
85
+
86
def get_multiple_choice_adapter_spec(
    method: str,
    instructions: str,
    input_noun: Optional[str],
    output_noun: str,
    max_train_instances: int = 5,
    num_outputs: int = 5,
    max_tokens: int = 1,
    empty_input: bool = False,
    sample_train: bool = True,
    **kwargs,
) -> AdapterSpec:
    """
    Toggle between joint and separate adapters.

    For the joint method, every argument except `empty_input` (including `**kwargs`) is
    forwarded to `get_multiple_choice_joint_adapter_spec`.

    For the two separate methods, only `method` and `empty_input` are used; the remaining
    arguments and `**kwargs` are ignored.

    Raises:
        ValueError: if `method` is not one of the multiple-choice adaptation methods.
    """
    if method == ADAPT_MULTIPLE_CHOICE_JOINT:
        return get_multiple_choice_joint_adapter_spec(
            instructions,
            input_noun,
            output_noun,
            max_train_instances=max_train_instances,
            num_outputs=num_outputs,
            max_tokens=max_tokens,
            sample_train=sample_train,
            **kwargs,
        )

    if method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
        # The separate spec is fully determined by `method` and `empty_input`.
        return get_multiple_choice_separate_adapter_spec(method, empty_input)

    raise ValueError(f"Invalid adaptation method: {method}")
116
+
117
+
118
def get_ranking_binary_adapter_spec(
    instructions: str = "",
    document_noun: str = "Passage",
    query_noun: str = "Query",
    output_prefix: str = "Does the passage answer the query?",
    output_noun: str = "Answer",
    max_train_instances: int = 4,
    num_outputs: int = 1,
    num_train_trials: int = 1,
    temperature: float = 0.0,
    max_tokens: int = 5,
    **kwargs,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for binary ranking. Prompt layout:

    [instructions]

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]: [output]

    ...

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]: [output]

    [document_noun]: [document]
    [query_noun]: [query]
    [output_prefix]
    [output_noun]: [output]

    `max_train_instances` is the total number of in-context examples and must be even;
    half of that value is what gets stored on the returned `AdapterSpec`.
    Extra keyword arguments are forwarded to `AdapterSpec` unchanged.

    Raises:
        AssertionError: if `max_train_instances` is odd.
    """
    # The trailing space on the first fragment matters: the two literals are concatenated.
    msg = (
        "There must be an even number of in-context examples to ensure that "
        "an equal number of positive and negative examples are included."
    )
    assert max_train_instances % 2 == 0, msg
    # Integer division keeps the value an exact int (no round trip through float).
    # NOTE(review): presumably the adapter emits one positive and one negative example per
    # train instance, hence the halving -- confirm against the ranking adapter.
    max_train_instances = max_train_instances // 2

    return AdapterSpec(
        method=ADAPT_RANKING_BINARY,
        instructions=format_instructions(instructions),
        input_prefix=f"{query_noun}: ",
        input_suffix="\n",
        reference_prefix=f"{document_noun}: ",
        reference_suffix="\n",
        output_prefix=f"{output_prefix}\n{output_noun}: ",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        num_train_trials=num_train_trials,
        temperature=temperature,
        max_tokens=max_tokens,
        **kwargs,
    )
173
+
174
+
175
def get_completion_adapter_spec(
    instructions: str = "",
    input_prefix: str = "",
    output_prefix: str = "",
    output_suffix: str = "",
    max_train_instances: int = 0,
    temperature: float = 0.0,
    num_outputs: int = 1,
    max_tokens: int = 100,
    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
    **kwargs,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for plain completion. Prompt layout:

    [input][output_prefix][output][output_suffix]

    [input][output_prefix]

    Passing `stop_sequences=None` results in an empty list of stop sequences.
    Extra keyword arguments are forwarded to `AdapterSpec` unchanged.
    """
    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=format_instructions(instructions),
        input_prefix=input_prefix,
        input_suffix="",
        output_prefix=output_prefix,
        output_suffix=output_suffix,
        max_train_instances=max_train_instances,
        temperature=temperature,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        stop_sequences=[] if stop_sequences is None else stop_sequences,
        **kwargs,
    )
209
+
210
+
211
def get_generation_adapter_spec(
    instructions: str = "",
    input_noun: Optional[str] = None,
    newline_after_input_noun: bool = False,
    output_noun: Optional[str] = None,
    newline_after_output_noun: bool = False,
    max_train_instances: int = 5,
    num_outputs: int = 1,
    max_tokens: int = 5,
    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
    temperature: float = 0.0,
    multi_label: bool = False,
) -> AdapterSpec:
    """
    Build the `AdapterSpec` for generation. Prompt layout:

    [instructions]

    [input_noun]: [input]
    [output_noun]: [output]

    [input_noun]: [input]
    [output_noun]:

    Passing `stop_sequences=None` results in the single stop sequence "\\n".
    """

    def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
        """
        Return the prefix for `noun`, or the empty string when `noun` is None.

        When `append_new_line` is False:
            [input_noun]: [input]

        When `append_new_line` is True:
            [input_noun]:
            [input]
        """
        if noun is None:
            return ""
        separator = "\n" if append_new_line else " "
        return f"{noun}:{separator}"

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=format_instructions(instructions),
        input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
        input_suffix="\n",
        output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=temperature,
        stop_sequences=["\n"] if stop_sequences is None else stop_sequences,
        multi_label=multi_label,
    )
265
+
266
+
267
def get_instruct_adapter_spec(
    num_outputs: int = 1,
    max_tokens: int = 512,
    temperature: float = 0.7,
) -> AdapterSpec:
    """
    Zero-shot instruction-following.

    Produces the same spec as `get_few_shot_instruct_adapter_spec` with
    `max_train_instances` fixed to 0.
    """
    return get_few_shot_instruct_adapter_spec(
        num_outputs=num_outputs,
        max_tokens=max_tokens,
        temperature=temperature,
        max_train_instances=0,
    )
288
+
289
+
290
def get_few_shot_instruct_adapter_spec(
    num_outputs: int = 1,
    max_tokens: int = 512,
    temperature: float = 0.7,
    max_train_instances: int = 0,
) -> AdapterSpec:
    """
    Few-shot instruction-following.

    The spec has no instructions, no prefixes, a newline after each input and no stop
    sequences. `max_train_instances` defaults to 0.
    """
    # Sampling controls come from the caller; the prompt formatting is fixed.
    sampling = dict(num_outputs=num_outputs, max_tokens=max_tokens, temperature=temperature)
    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions="",
        input_prefix="",
        input_suffix="\n",
        output_prefix="",
        output_suffix="",
        max_train_instances=max_train_instances,
        stop_sequences=[],
        **sampling,
    )
312
+
313
+
314
def get_language_modeling_adapter_spec() -> AdapterSpec:
    """
    Used for language modeling.

    The spec carries no instructions, prefixes or suffixes, uses no in-context examples,
    and requests a single output with `max_tokens=0` at temperature 0.
    """
    # Every piece of prompt decoration is the empty string.
    blank_formatting = dict(instructions="", input_prefix="", input_suffix="", output_prefix="", output_suffix="")
    return AdapterSpec(
        method=ADAPT_LANGUAGE_MODELING,
        max_train_instances=0,
        num_outputs=1,
        max_tokens=0,
        temperature=0.0,
        **blank_formatting,
    )
330
+
331
+
332
def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
    """
    Used for summarization.

    `num_sents` selects the wording of the summarization request placed before the output:
    None omits the sentence count, 1 uses the singular "sentence", any other value uses the
    plural "sentences". Extra keyword arguments are forwarded to `AdapterSpec` unchanged.
    """
    if num_sents is None:
        request = "Summarize the above article.\n"
    elif num_sents == 1:
        request = "Summarize the above article in 1 sentence.\n"
    else:
        request = f"Summarize the above article in {num_sents} sentences.\n"

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions="",
        input_prefix="###\nArticle: ",
        input_suffix="\n\n",
        output_prefix=request,
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=1,
        stop_sequences=["###"],  # Separator between few-shot instances.
        **kwargs,
    )
356
+
357
+
358
def get_machine_translation_adapter_spec(
    source_language, target_language, max_train_instances, **kwargs
) -> AdapterSpec:
    """
    Used for machine translation.

    Each input is prefixed with "[source_language]: " and each output with
    "[target_language]: ". Generation stops at a blank line ("\\n\\n").
    Extra keyword arguments are forwarded to `AdapterSpec` unchanged.
    """
    task_description = f"Translate the following sentences from {source_language} to {target_language}."
    source_prefix = f"{source_language}: "
    target_prefix = f"{target_language}: "

    return AdapterSpec(
        method=ADAPT_GENERATION,
        instructions=task_description,
        input_prefix=source_prefix,
        input_suffix="\n",
        output_prefix=target_prefix,
        output_suffix="\n",
        max_train_instances=max_train_instances,
        num_outputs=1,
        stop_sequences=["\n\n"],
        temperature=0.0,
        **kwargs,
    )
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import Optional, Dict, List
2
+ from typing import Optional, Dict, List, Any
3
3
 
4
4
  from helm.benchmark.scenarios.scenario import Instance
5
5
  from helm.common.general import indent_lines, format_text_lines, serialize
@@ -45,6 +45,11 @@ class RequestState:
45
45
  num_conditioning_tokens: int = 0
46
46
  """The number of initial tokens that will be ignored when computing language modeling metrics"""
47
47
 
48
+ annotations: Optional[Dict[str, Any]] = None
49
+ """Output of some post-processing step that is needed for the metric to understand the request
50
+ Should match the annotator's name to an Annotation (usually a list of dictionaries for each completion)
51
+ Example: parsing, rendering an image based on the text completion, etc."""
52
+
48
53
  def __post_init__(self):
49
54
  if self.request_mode:
50
55
  assert self.request_mode in ["original", "calibration"], f"Invalid request_mode: {self.request_mode}"
@@ -3,8 +3,9 @@ from dataclasses import dataclass
3
3
  from typing import List, Dict, Tuple, Optional
4
4
 
5
5
  from helm.benchmark.scenarios.scenario import Instance
6
- from .adapter_spec import AdapterSpec
7
- from .request_state import RequestState
6
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
7
+ from helm.benchmark.adaptation.request_state import RequestState
8
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
8
9
 
9
10
 
10
11
  @dataclass
@@ -21,6 +22,9 @@ class ScenarioState:
21
22
  # List of `RequestState`s that were produced by adaptation (and execution)
22
23
  request_states: List[RequestState]
23
24
 
25
+ # Annotations to use for this run spec
26
+ annotator_specs: Optional[List[AnnotatorSpec]] = None
27
+
24
28
  def __post_init__(self):
25
29
  # Create derived indices based on `request_states` so it's easier for
26
30
  # the `Metric` later to access them. Two things are produced:
@@ -0,0 +1,43 @@
1
+ from typing import Dict, List, Any
2
+ from abc import abstractmethod, ABC
3
+ from dataclasses import dataclass
4
+
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.common.object_spec import ObjectSpec
7
+
8
+
9
class Annotator(ABC):
    """Abstract base class for annotating a request state.

    An annotator adds information to a request state that a metric needs in order to
    understand the request, e.g. parsing, or rendering an image based on the text completion.
    """

    # Name of the annotator. Should be filled in by the subclass.
    name: str

    @abstractmethod
    def annotate(self, request_state: RequestState) -> Any:
        """Produce the implementation-specific annotation for `request_state`.

        The result is meant to fill the annotations field of the request state.
        """
22
+
23
+
24
@dataclass(frozen=True)
class AnnotatorSpec(ObjectSpec):
    """Describes how an `Annotator` is to be created.

    Only the class name should be specified by the user; the arguments are filled in
    by the `AnnotatorFactory`.
    """
32
+
33
+
34
class DummyAnnotator(Annotator):
    """A dummy annotator: it only upper-cases the text of each completion."""

    name = "dummy"

    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
        """Return one `{"all_caps": <upper-cased text>}` dictionary per completion.

        Raises:
            ValueError: if the request state has no result.
        """
        result = request_state.result
        if result is None:
            raise ValueError("Annotation requires a result")
        return [{"all_caps": completion.text.upper()} for completion in result.completions]
@@ -0,0 +1,61 @@
1
+ import os
2
+ from typing import Any, Dict, Mapping, Optional
3
+
4
+ from helm.common.credentials_utils import provide_api_key
5
+ from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
6
+ from helm.common.hierarchical_logger import hlog
7
+ from helm.common.object_spec import create_object, inject_object_spec_args
8
+ from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
9
+
10
+
11
class AnnotatorFactory:
    """Factory for creating annotators."""

    def __init__(
        self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
    ):
        """Store the configuration used to build annotators and start with an empty cache."""
        self.credentials = credentials
        self.file_storage_path = file_storage_path
        self.cache_backend_config = cache_backend_config
        hlog(f"AnnotatorFactory: file_storage_path = {file_storage_path}")
        hlog(f"AnnotatorFactory: cache_backend_config = {cache_backend_config}")

        # Already-created annotators, keyed by annotator name.
        # Reusing them prevents duplicate creation of annotators. This is especially
        # important as annotation is a multi-threaded process and creating a new
        # annotator for each request can cause race conditions.
        self.annotators: Dict[str, Annotator] = {}

    def get_annotator(self, annotator_spec: AnnotatorSpec) -> Annotator:
        """Return the annotator for `annotator_spec`, creating and caching it on first use.

        The annotator name is the lower-cased class name (last dotted component of
        `class_name`) with every occurrence of "annotator" removed. The spec must not
        carry any arguments; they are injected here.
        """
        assert annotator_spec.args is None or annotator_spec.args == {}

        class_basename: str = annotator_spec.class_name.rsplit(".", 1)[-1]
        annotator_name: str = class_basename.lower().replace("annotator", "")

        # Serve from the cache when this annotator was created before.
        cached: Optional[Annotator] = self.annotators.get(annotator_name)
        if cached is not None:
            return cached

        # Otherwise, create the annotator, injecting the arguments its constructor may need.
        cache_config: CacheConfig = self.cache_backend_config.get_cache_config(annotator_name)

        def api_key_provider():
            return provide_api_key(self.credentials, annotator_name)

        def file_storage_path_provider():
            return self._get_file_storage_path(annotator_name)

        annotator_spec = inject_object_spec_args(
            annotator_spec,
            constant_bindings={
                "cache_config": cache_config,
            },
            provider_bindings={
                "api_key": api_key_provider,
                "file_storage_path": file_storage_path_provider,
            },
        )
        created = create_object(annotator_spec)

        # Remember the annotator for subsequent calls.
        self.annotators[annotator_name] = created
        return created

    def _get_file_storage_path(self, annotator_name: str) -> str:
        """Return the path to use for a local file cache for the given annotator."""
        return os.path.join(self.file_storage_path, "output", annotator_name)