crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
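
The remainder of this diff excerpts the hunks for the `helm/benchmark/metrics` files in the listing above. Before those, one structural change worth calling out from the listing itself: the `helm.proxy.clients` and `helm.proxy.tokenizers` packages were promoted to top-level `helm.clients` and `helm.tokenizers` packages, and the monolithic `helm/benchmark/run_specs.py` was split into a `run_specs/` package. Downstream imports of the moved modules need updating; a minimal sketch (the module paths come from the rename entries above, while the class names are assumed to be the usual exports of these modules):

```python
# 0.3.0 import paths (removed in 0.5.0):
# from helm.proxy.clients.huggingface_client import HuggingFaceClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# 0.5.0 import paths, per the renames in the file list:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```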
helm/benchmark/metrics/metric_service.py

@@ -2,9 +2,13 @@ from typing import Optional
 
 from helm.common.authentication import Authentication
 from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.proxy.services.service import Service
+from helm.common.cache import Cache
 
 
 class MetricService(TokenizerService):
@@ -15,8 +19,20 @@ class MetricService(TokenizerService):
     def __init__(self, service: Service, auth: Authentication):
         super().__init__(service, auth)
 
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self._service.check_nudity(self._auth, request)
+
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self._service.compute_clip_score(self._auth, request)
+
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self._service.upload(self._auth, request)
+
     def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
         return self._service.get_toxicity_scores(self._auth, request)
 
     def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
         return self._service.make_critique_request(self._auth, request)
+
+    def get_cache(self, shard_name: str) -> Cache:
+        return Cache(self._service.get_cache_config(shard_name))
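
The new methods make `MetricService` the single gateway metrics use for nudity checks, CLIP scores, file uploads, and named cache shards. A minimal sketch of how an image metric might call them; the `CLIPScoreRequest` constructor fields and the `.score` attribute are illustrative assumptions, not confirmed by this diff:

```python
from helm.benchmark.metrics.metric_service import MetricService
from helm.common.clip_score_request import CLIPScoreRequest


def clip_score(metric_service: MetricService, caption: str, image_path: str) -> float:
    # Field names below are assumptions for illustration; check the
    # CLIPScoreRequest dataclass for the actual signature.
    request = CLIPScoreRequest(caption=caption, image_location=image_path)
    result = metric_service.compute_clip_score(request)
    return result.score


# get_cache gives a metric its own named shard of the service-backed cache:
# cache = metric_service.get_cache("clip_score_metric")
```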
helm/benchmark/metrics/paraphrase_generation_metrics.py

@@ -1,13 +1,13 @@
 from typing import List
 
 from helm.benchmark.adaptation.request_state import RequestState
-from .metric import Metric
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from .metric_name import MetricName
 from .statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu
 
 
-class CLEVAParaphraseGenerationMetric(Metric):
+class CLEVAParaphraseGenerationMetric(EvaluateInstancesMetric):
     """
     Compute the Chinese iBLEU score for Paraphrase Generation scenarios of CLEVA benchmark.
     This implementation allows variable number of references (i.e., golds).
@@ -21,8 +21,7 @@ class CLEVAParaphraseGenerationMetric(Metric):
     def __init__(self, alpha: float = 0.8):  # calculate iBLEU_0.8 by default
         self.alpha = alpha
 
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         inputs: List = []
         preds: List = []
         golds: List[List[str]] = []
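
This is a recurring pattern in 0.5.0: instance-level metrics now extend `EvaluateInstancesMetric` instead of `Metric`, and `evaluate_instances` gains an `eval_cache_path` parameter. A minimal sketch of a conforming subclass (the metric itself is a toy example, not from the package):

```python
from typing import List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat


class RequestCountMetric(EvaluateInstancesMetric):
    """Toy metric: reports the number of request states evaluated."""

    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
        # eval_cache_path is part of the new signature; this toy metric has nothing to cache.
        return [Stat(MetricName("num_request_states")).add(len(request_states))]
```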
helm/benchmark/metrics/ranking_metrics.py

@@ -4,11 +4,11 @@ from typing import Callable, Dict, List, Tuple, Optional
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_RANKING_BINARY
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.reference_metric import ReferenceMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
-from helm.common.general import binarize_dict
-from .metric import Metric
+from helm.common.general import assert_present, binarize_dict
 from .metric_name import MetricName
 from .metric_service import MetricService
 from .statistic import Stat
@@ -58,7 +58,7 @@ class RankingObject:
     model_relevance: Optional[int] = None
 
 
-class RankingMetric(Metric):
+class RankingMetric(ReferenceMetric):
     """Ranking metric."""
 
     """ Methods supported by this metric.
@@ -205,14 +205,13 @@ class RankingMetric(Metric):
         relevance dictionary, which contains the ground truth relevance
         values for each document.
         """
-        assert all([r.model_relevance is not None for r in ranking_objs])
         if rank_limit:
             return {
-                self.get_query_string(r.reference_index): r.model_relevance  # type: ignore
+                self.get_query_string(r.reference_index): assert_present(r.model_relevance)
                 for r in ranking_objs
                 if r.rank and r.rank <= rank_limit
             }
-        return {self.get_query_string(r.reference_index): r.model_relevance for r in ranking_objs}  # type: ignore
+        return {self.get_query_string(r.reference_index): assert_present(r.model_relevance) for r in ranking_objs}
 
     def get_true_relevances(self, ranking_objects: List[RankingObject]) -> Dict[str, int]:
         """Get the true relevance dictionary."""
@@ -372,7 +371,7 @@ class RankingMetric(Metric):
         # len(ranking_objects) minus its relevance.
         stats += [
             Stat(MetricName(f"ref{r.reference_index}_rank")).add(
-                len(ranking_objects) - r.model_relevance  # type: ignore
+                len(ranking_objects) - assert_present(r.model_relevance)
             )
             for r in ranking_objects
         ]
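
`assert_present` (imported above from `helm.common.general`) replaces the `assert`-plus-`# type: ignore` pattern. Its definition is not part of this excerpt; a sketch of the presumed behavior, runtime-checking an `Optional` value while narrowing its type:

```python
from typing import Optional, TypeVar

T = TypeVar("T")


def assert_present(value: Optional[T]) -> T:
    # Presumed behavior: fail loudly on None, otherwise return the value
    # with the Optional[...] narrowed away for the type checker.
    assert value is not None, "Expected value to be present"
    return value
```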
helm/benchmark/metrics/reference_metric.py (new file)

@@ -0,0 +1,148 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Dict
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricResult,
+    PerInstanceStats,
+    add_context,
+    compute_worst_case_metrics,
+)
+
+from helm.common.general import parallel_map
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MODELING
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from .metric_name import MetricName, MetricContext
+from .metric_service import MetricService
+from .statistic import Stat, merge_stat
+
+
+@dataclass(frozen=True)
+class Processor:
+    """Evaluates an instance."""
+
+    # TODO: not ideal that we have circular dependencies; subclasses of Metric
+    # should override the Processor rather than the Metric.
+    metric: "ReferenceMetric"
+    metric_service: MetricService
+    eval_cache_path: str
+    adapter_spec: AdapterSpec
+
+    def process(self, references_states: List[RequestState]) -> List[Stat]:
+        instance_stats: List[Stat] = []
+
+        # Evaluate the references
+        if len(references_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_references(
+                self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
+            )
+        )
+
+        # Add instance-related context (e.g., split, perturbation) to the metrics
+        for i, stat in enumerate(instance_stats):
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(references_states[0].instance))
+
+        return instance_stats
+
+
+class ReferenceMetric(MetricInterface, ABC):
+    """
+    A `Metric` takes the results of execution and produces `Stat`s for a
+    scenario.
+
+    Note: `Metric` actually right now is a bit of misnomer because it produces many
+    `Stat`s, that might be distinct but are computed together. Eventually we
+    might move to a world where there is one (or very few metrics that are domain-independent).
+    """
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        """
+        Main entry point for a `Metric`. This function groups the single
+        list of `RequestState` by training trial and instance, and invokes
+        other functions to process those. This should serve most purposes.
+
+        Any logic that doesn't decompose along instances should go here, such
+        as robustness.
+        """
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )
+
+        adapter_spec = scenario_state.adapter_spec
+        global_stats: Dict[MetricName, Stat] = {}
+        all_per_instance_stats: List[PerInstanceStats] = []
+
+        for train_trial_index in range(adapter_spec.num_train_trials):
+            # Construct inputs
+            request_state_sets: List[List[RequestState]] = []
+            for instance in scenario_state.instances:
+                references_states = []
+                for reference_index in range(len(instance.references)):
+                    references_states.extend(
+                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
+                    )
+                request_state_sets.append(references_states)
+
+            # Do it!
+            processor = Processor(
+                metric=self,
+                metric_service=metric_service,
+                eval_cache_path=eval_cache_path,
+                adapter_spec=scenario_state.adapter_spec,
+            )
+            results: List[List[Stat]] = parallel_map(
+                processor.process,
+                request_state_sets,
+                parallelism=parallelism,
+            )
+
+            # Compute per-instance stats
+            per_instance_stats: List[PerInstanceStats] = []
+            for instance, stats in zip(scenario_state.instances, results):
+                assert instance.id is not None, f"id was none for instance: {instance}"
+                # Sometimes a metric (e.g., BiasMetric) doesn't produce any statistics
+                if len(stats) > 0:
+                    per_instance_stats.append(
+                        PerInstanceStats(instance.id, instance.perturbation, train_trial_index, stats)
+                    )
+
+            # Aggregate these stats
+            trial_stats: Dict[MetricName, Stat] = {}  # Statistics just for this trial
+            for instance_stats in results:
+                for stat in instance_stats:
+                    merge_stat(trial_stats, stat)
+
+            # Compute worst-case metrics.
+            # This is here since we want these stats for all metrics and they
+            # aggregate across contexts (perturbations).
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
+            for stat in worst_case_stats:
+                merge_stat(trial_stats, stat)
+
+            # We take the mean value for each trial.
+            for stat in trial_stats.values():
+                merge_stat(global_stats, stat.take_mean())
+
+            all_per_instance_stats.extend(per_instance_stats)
+
+        # Wrap aggregated and per-instance stats in a MetricResult.
+        return MetricResult(list(global_stats.values()), all_per_instance_stats)
+
+    @abstractmethod
+    def evaluate_references(
+        self,
+        adapter_spec: AdapterSpec,
+        reference_request_states: List[RequestState],
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Evaluate the references. Override me!"""
+        pass
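
Concrete reference metrics inherit this scaffolding and override only `evaluate_references`; `evaluate` handles trial/instance grouping, parallelism, and aggregation. A minimal sketch of a subclass (the statistic it emits is a toy example; the signature is copied from the abstract method above):

```python
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.reference_metric import ReferenceMetric
from helm.benchmark.metrics.statistic import Stat


class ReferenceCountMetric(ReferenceMetric):
    """Toy ReferenceMetric: counts the reference request states per instance."""

    def evaluate_references(
        self,
        adapter_spec: AdapterSpec,
        reference_request_states: List[RequestState],
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        return [Stat(MetricName("num_reference_states")).add(len(reference_request_states))]
```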
helm/benchmark/metrics/summac/model_summac.py

@@ -50,7 +50,6 @@ class SummaCImager:
     def __init__(
         self, model_name="mnli", granularity="paragraph", use_cache=True, max_doc_sents=100, device="cuda", **kwargs
     ):
-
         self.grans = granularity.split("-")
 
         assert (
@@ -154,7 +153,6 @@ class SummaCImager:
            for j in range(N_gen)
        ]
        for batch in utils_misc.batcher(dataset, batch_size=20):
-
            if self.model_name == "decomp":
                batch_evids, batch_conts, batch_neuts = [], [], []
                batch_json = [{"premise": d["premise"], "hypothesis": d["hypothesis"]} for d in batch]
helm/benchmark/metrics/summarization_metrics.py

@@ -5,19 +5,20 @@ import os
 import pickle
 
 import spacy
+import spacy.cli
 from typing import List, Dict, Optional
 from collections import defaultdict
 
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
 from .metric import Metric, MetricResult
 from .metric_name import MetricName
 from .metric_service import MetricService
-from .basic_metrics import get_rouge_function
 from .statistic import Stat
 from .summac.model_summac import SummaCZS
 from bert_score import BERTScorer
@@ -52,12 +53,12 @@ class SummarizationMetric(Metric):
         # avoid triggering a bug in DataStatsMetric that raises
         # `NameError: name 'stderr' is not defined`
         if not spacy.util.is_package("en_core_web_sm"):
-            spacy.cli.download("en_core_web_sm")  # type: ignore
+            spacy.cli.download("en_core_web_sm")
 
         try:
             from summ_eval.data_stats_metric import DataStatsMetric
         except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["metrics"])
+            handle_module_not_found_error(e, ["summarization"])
 
         self.data_stats_metric = DataStatsMetric()
         self.task: str = task
@@ -168,7 +169,6 @@ class SummarizationMetric(Metric):
         metric_service: MetricService,
         eval_cache_path: str,
     ) -> List[Stat]:
-
         refs: List[str] = [self._remove_braces(ref.output.text) for ref in request_state.instance.references]
         inp: str = self._remove_braces(request_state.instance.input.text)
 
@@ -182,9 +182,9 @@ class SummarizationMetric(Metric):
                 self.humaneval = self._load_humaneval(eval_cache_path)
 
             # get human evaluation scores if they exist
-            model_name = adapter_spec.model.replace("/", "_")
+            deployment = adapter_spec.model_deployment.replace("/", "_")
             for metric_name in ["faithfulness", "relevance", "coherence"]:
-                val = self.humaneval[(metric_name, model_name, request_state.instance.id, pred)]
+                val = self.humaneval[(metric_name, deployment, request_state.instance.id, pred)]
                 result.append(Stat(MetricName(f"HumanEval-{metric_name}")).add(float(val)))
         except KeyError:
             pass
@@ -196,8 +196,8 @@ class SummarizationMetric(Metric):
             if self.qa_fact_eval is None:
                 self._load_qafacteval(eval_cache_path)
             assert self.qa_fact_eval is not None
-            model_name = adapter_spec.model.replace("/", "_")
-            val = self.qa_fact_eval[model_name][(request_state.instance.id, pred)]
+            deployment = adapter_spec.model_deployment.replace("/", "_")
+            val = self.qa_fact_eval[deployment][(request_state.instance.id, pred)]
             result.append(Stat(MetricName("QAFactEval")).add(float(val)))
         except KeyError:
             pass
helm/benchmark/metrics/test_classification_metrics.py

@@ -7,7 +7,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.classification_metrics import ClassificationMetric
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.scenario import Input, Instance, Output, Reference, CORRECT_TAG
-from helm.common.request import Request, RequestResult, Sequence
+from helm.common.request import Request, RequestResult, GeneratedOutput
 
 
 class _Option(NamedTuple):
@@ -26,9 +26,12 @@ def _request_state(prediction: str, options: List[_Option]):
         request_mode=None,
         train_trial_index=0,
         output_mapping=None,
-        request=Request(),
+        request=Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002"),
         result=RequestResult(
-            success=True, embedding=[], completions=[Sequence(text=prediction, logprob=0.0, tokens=[])], cached=False
+            success=True,
+            embedding=[],
+            completions=[GeneratedOutput(text=prediction, logprob=0.0, tokens=[])],
+            cached=False,
         ),
         num_train_instances=0,
         prompt_truncated=False,
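
Two request-API changes surface in this fixture: `Sequence` is renamed to `GeneratedOutput`, and a bare `Request()` is replaced by one that names the model and deployment explicitly. Constructing a result under the new API (values copied from the fixture above):

```python
from helm.common.request import GeneratedOutput, Request, RequestResult

request = Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002")
result = RequestResult(
    success=True,
    embedding=[],
    completions=[GeneratedOutput(text="yes", logprob=0.0, tokens=[])],
    cached=False,
)
```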
@@ -76,7 +79,7 @@ def test_evaluate_instances_binary_generation():
     ]
 
     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "yes": {"tp": 3, "fp": 1, "tn": 2, "fn": 1},
@@ -106,7 +109,7 @@ def test_evaluate_instances_multi_class():
         _request_state("invalid", _options("c")),
     ]
     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
@@ -139,7 +142,7 @@ def test_evaluate_instances_multilabel():
     ]
 
     assert_stats_equal(
-        metric.evaluate_instances(request_states),
+        metric.evaluate_instances(request_states, ""),
         _expected_stats(
             {
                 "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
helm/benchmark/metrics/test_disinformation_metrics.py (new file)

@@ -0,0 +1,78 @@
+# Test metrics
+from typing import List
+
+import numpy as np
+import pytest
+from helm.benchmark.metrics.disinformation_metrics import _monte_carlo_entropy, _self_bleu
+from helm.common.request import GeneratedOutput, Token
+
+# Test tokens
+_TEST_1_TOKENS: List[Token] = [
+    Token("This", logprob=-0.25),
+    Token("is", logprob=-0.25),
+    Token("a", logprob=-0.25),
+    Token("test", logprob=-0.25),
+]
+_TEST_2_TOKENS: List[Token] = [
+    Token("This", logprob=-0.25),
+    Token("is", logprob=-0.25),
+    Token("another", logprob=-0.5),
+    Token("test", logprob=-0.25),
+]
+_TEST_EMPTY_TOKENS: List[Token] = []
+test_empty_str_tokens: List[Token] = [
+    Token("", logprob=0),
+]
+
+# Test Sequences (two standard, one with an empty token, and one with no tokens)
+_TEST_1 = GeneratedOutput(text="This is a test", logprob=-1, tokens=_TEST_1_TOKENS)
+_TEST_2 = GeneratedOutput(text="This is another test", logprob=-1.25, tokens=_TEST_2_TOKENS)
+_TEST_EMPTY = GeneratedOutput(text="", logprob=-float("nan"), tokens=_TEST_EMPTY_TOKENS)
+_TEST_EMPTY_STR = GeneratedOutput(text="", logprob=0, tokens=test_empty_str_tokens)
+
+
+# Test Self-BLEU
+def test_self_bleu_with_self():
+    score = _self_bleu([_TEST_1, _TEST_1])
+    assert score == pytest.approx(100)
+
+
+def test_self_blue_with_other():
+    score = _self_bleu([_TEST_1, _TEST_2])
+    assert 0 < score < 100
+
+
+def test_self_blue_one_sequence():
+    score = _self_bleu([_TEST_1])
+    assert score == 0
+
+
+def test_self_blue_one_full_one_empty():
+    score = _self_bleu([_TEST_1, _TEST_EMPTY_STR])
+    assert score == 0
+
+
+# Test MC Entropy
+def test_mc_entropy_with_self():
+    score = _monte_carlo_entropy([_TEST_1, _TEST_1])
+    assert score == pytest.approx(-_TEST_1.logprob)
+
+
+def test_mc_entropy_with_other():
+    score = _monte_carlo_entropy([_TEST_1, _TEST_2])
+    assert score == pytest.approx(-(_TEST_1.logprob + _TEST_2.logprob) / 2)
+
+
+def test_mc_entropy_one_sequence():
+    score = _monte_carlo_entropy([_TEST_1])
+    assert score == -_TEST_1.logprob
+
+
+def test_mc_entropy_one_full_one_empty():
+    score = _monte_carlo_entropy([_TEST_EMPTY_STR])
+    assert score == _TEST_EMPTY_STR.logprob
+
+
+def test_mc_entropy_with_no_tokens():
+    score = _monte_carlo_entropy([_TEST_EMPTY])
+    assert np.isnan(score)
@@ -0,0 +1,30 @@
+ from helm.benchmark.metrics.evaluate_reference_metrics import (
+     exact_match,
+     exact_match_indicator,
+     final_number_exact_match,
+ )
+
+
+ def test_exact_match():
+     assert exact_match("33", "33") == 1
+     assert exact_match("33", "33 ") == 1
+     assert exact_match("33", "34") == 0
+
+
+ def test_exact_match_indicator():
+     assert exact_match_indicator("33", "33") == 1
+     assert exact_match_indicator("33", "stuff 33") == 1
+     assert exact_match_indicator("stuff 33", "33") == 1
+     assert exact_match_indicator("33", "33 stuff") == 0
+
+
+ def test_final_number_exact_match():
+     assert final_number_exact_match("33", "33") == 1
+     assert final_number_exact_match("33", "33 eggs.") == 1
+     assert final_number_exact_match("33", "-33") == 0
+     assert final_number_exact_match("-33", "-33") == 1
+     assert final_number_exact_match("The answer is 33", "\\boxed{33}") == 1
+     assert final_number_exact_match("The answer is 33", "\\boxed{33} and 34") == 0
+     assert final_number_exact_match("34.2", "2") == 0
+     assert final_number_exact_match("342", "342.") == 1
+     assert final_number_exact_match("3,420", "3420") == 1
@@ -3,13 +3,13 @@ from .basic_metrics import get_num_bytes, convert_tokens_to_text


  def test_get_num_bytes():
-     tokens = [Token(text, 0, {}) for text in ["bytes:\\x99", "Hello", " world", "bytes:\\xe2\\x80"]]
+     tokens = [Token(text, 0) for text in ["bytes:\\x99", "Hello", " world", "bytes:\\xe2\\x80"]]
      assert get_num_bytes(tokens) == 14


  def test_convert_tokens_to_text():
      tokens = [
-         Token(text, 0, {})
+         Token(text, 0)
          for text in [
              "<|endoftext|>",
              "bytes:\\xe2\\x80",
@@ -39,5 +39,5 @@ class AutoTokenCostEstimator(TokenCostEstimator):
          """
          Estimate the number of tokens for a given request based on the organization.
          """
-         token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_organization)
+         token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_host)
          return token_cost_estimator.estimate_tokens(request, metric_service)
@@ -2,11 +2,19 @@ from helm.benchmark.metrics.metric_service import MetricService
  from helm.benchmark.window_services.window_service import WindowService
  from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
  from helm.common.request import Request
- from helm.proxy.token_counters.gooseai_token_counter import GooseAITokenCounter
  from .token_cost_estimator import TokenCostEstimator


  class GooseAITokenCostEstimator(TokenCostEstimator):
+     # From https://goose.ai/pricing: "the base price includes your first 25 tokens
+     # generated, and you can scale beyond that on a per-token basis."
+     BASE_PRICE_TOKENS: int = 25
+
+     @staticmethod
+     def account_for_base_tokens(num_tokens: int) -> int:
+         """Subtracts the number of tokens included in the base price."""
+         return max(num_tokens - GooseAITokenCostEstimator.BASE_PRICE_TOKENS, 0)
+
      def estimate_tokens(self, request: Request, metric_service: MetricService) -> int:
          """
          Estimate the number of generated tokens for a given request. Formula:
@@ -17,6 +25,8 @@ class GooseAITokenCostEstimator(TokenCostEstimator):
          """
          total_estimated_tokens: int = request.num_completions * request.max_tokens
          if request.echo_prompt:
-             window_service: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service)
+             window_service: WindowService = WindowServiceFactory.get_window_service(
+                 request.model_deployment, metric_service
+             )
              total_estimated_tokens += window_service.get_num_tokens(request.prompt)
-         return GooseAITokenCounter.account_for_base_tokens(total_estimated_tokens)
+         return GooseAITokenCostEstimator.account_for_base_tokens(total_estimated_tokens)
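The base-token adjustment is easy to sanity-check against the pricing comment above: an estimate of 300 tokens is billed for 275, and anything at or under the 25-token allowance is billed for zero. Hypothetical checks, not tests shipped with the package:

    # Hypothetical sanity checks for account_for_base_tokens (not part of the package).
    assert GooseAITokenCostEstimator.account_for_base_tokens(300) == 275  # 300 - 25
    assert GooseAITokenCostEstimator.account_for_base_tokens(25) == 0
    assert GooseAITokenCostEstimator.account_for_base_tokens(10) == 0  # max() keeps it non-negative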
@@ -15,7 +15,7 @@ class OpenAITokenCostEstimator(TokenCostEstimator):

          Add num_tokens(prompt) if Request.echo_prompt is True.
          """
-         tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service)
+         tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model_deployment, metric_service)
          num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt)
          total_estimated_tokens: int = num_prompt_tokens + request.num_completions * request.max_tokens
@@ -10,6 +10,8 @@ class TestAI21TokenCostEstimator:

      def test_estimate_tokens(self):
          request = Request(
+             model="openai/text-davinci-002",
+             model_deployment="openai/text-davinci-002",
              prompt="The Center for Research on Foundation Models (CRFM) is "
              "an interdisciplinary initiative born out of the Stanford "
              "Institute for Human-Centered Artificial Intelligence (HAI) "
@@ -12,7 +12,6 @@ from .openai_token_cost_estimator import OpenAITokenCostEstimator


  class TestOpenAITokenCostEstimator:
-
      # The following prompt has 51 tokens according to the GPT-2 tokenizer
      TEST_PROMPT: str = (
          "The Center for Research on Foundation Models (CRFM) is "
@@ -37,13 +36,21 @@ class TestOpenAITokenCostEstimator:
          self._mock_metric_service.tokenize = MagicMock(return_value=tokenization_request_result)

      def test_estimate_tokens(self):
-         request = Request(prompt=TestOpenAITokenCostEstimator.TEST_PROMPT, num_completions=3, max_tokens=100)
+         request = Request(
+             model="openai/text-davinci-002",
+             model_deployment="openai/text-davinci-002",
+             prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
+             num_completions=3,
+             max_tokens=100,
+         )

          # Prompt + max number of tokens from completions = 51 + 3 * 100
          assert self._token_cost_estimator.estimate_tokens(request, self._mock_metric_service) == 51 + 3 * 100

      def test_estimate_tokens_with_echo_prompt(self):
          request = Request(
+             model="openai/text-davinci-002",
+             model_deployment="openai/text-davinci-002",
              prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
              echo_prompt=True,
              num_completions=1,
@@ -5,7 +5,7 @@ from helm.common.request import RequestResult
  from helm.common.hierarchical_logger import hlog
  from helm.benchmark.adaptation.request_state import RequestState
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.proxy.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
+ from helm.clients.perspective_api_client import PerspectiveAPIClientCredentialsError
  from .metric import Metric
  from .metric_name import MetricName
  from .metric_service import MetricService
@@ -0,0 +1,23 @@
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest, ToxicityAttributes
+
+
+ def compute_toxicity_score(text: str, metric_service: MetricService) -> float:
+     """
+     Compute the toxicity score of a given text using Perspective API.
+     Returns a value between 0 and 1, where a score of 0.5 or greater is deemed toxic.
+     """
+     response: PerspectiveAPIRequestResult = metric_service.get_toxicity_scores(
+         request=PerspectiveAPIRequest(text_batch=[text])
+     )
+     attributes: ToxicityAttributes = response.text_to_toxicity_attributes[text]
+     assert attributes.toxicity_score is not None
+     return attributes.toxicity_score
+
+
+ def is_prompt_toxic(text: str, metric_service: MetricService) -> bool:
+     """
+     Returns True if the prompt is considered toxic, False otherwise.
+     """
+     score: float = compute_toxicity_score(text, metric_service)
+     return score >= 0.5
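To illustrate the call pattern, the helpers can be exercised against a stand-in that fakes the single MetricService method they use (a hypothetical stub for illustration; a real MetricService routes the batch through the Perspective API):

    # Hypothetical stub, for illustration only. It fakes get_toxicity_scores and
    # mimics just the attributes the helpers read:
    #   response.text_to_toxicity_attributes[text].toxicity_score
    from types import SimpleNamespace


    class StubMetricService:
        def __init__(self, score: float):
            self._score = score

        def get_toxicity_scores(self, request):
            attributes = SimpleNamespace(toxicity_score=self._score)
            return SimpleNamespace(
                text_to_toxicity_attributes={text: attributes for text in request.text_batch}
            )


    assert is_prompt_toxic("some prompt", StubMetricService(score=0.9)) is True
    assert is_prompt_toxic("some prompt", StubMetricService(score=0.1)) is False  # below the 0.5 threshold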