crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,84 @@
1
+ from typing import Tuple, Dict, Any
2
+
3
+ from helm.common.optional_dependencies import handle_module_not_found_error
4
+
5
+ try:
6
+ from selenium import webdriver
7
+ import selenium.common.exceptions
8
+ except ModuleNotFoundError as e:
9
+ handle_module_not_found_error(e, suggestions=["image2structure"])
10
+
11
+
12
def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:
    """Start a headless Chrome WebDriver and navigate it to ``url``.

    Args:
        url (str): The URL of the website. Usually "http://localhost:{port}".
        resolution (tuple[int, int], optional): Passed to Chrome as
            ``--window-size={resolution[0]},{resolution[1]}``. Defaults to (1920, 1080).

    Returns:
        webdriver.Chrome: The Chrome WebDriver, already pointed at ``url``.

    Raises:
        Exception: Anything raised while starting Chrome or loading the page is
            propagated unchanged. If loading the page fails, the driver is quit first.
    """
    options = webdriver.ChromeOptions()
    options.add_argument(f"--window-size={resolution[0]},{resolution[1]}")
    options.add_argument("--headless")  # Optional: run in headless mode
    options.add_argument("--no-sandbox")  # Optional: for certain environments
    options.add_argument("--disable-dev-shm-usage")  # Optional: overcome limited resource problems
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver when navigation fails, so nobody else
        # can quit it: shut Chrome down here instead of leaking a browser process.
        driver.quit()
        raise
    return driver
30
+
31
+
32
def close_driver(driver: webdriver.Chrome):
    """Shut down a WebDriver session by calling its ``quit()`` method.

    Args:
        driver (webdriver.Chrome): The Chrome WebDriver to shut down
    """
    driver.quit()
39
+
40
+
41
class ScreenshotOptions:
    """Container for the parameters used when taking a screenshot"""

    # Resolution of the screenshot
    resolution: Tuple[int, int] = (1920, 1080)

    # Delay between each action, in milliseconds
    delay_between_each_action_ms: int = 1000
49
+
50
+
51
def save_random_screenshot(path: str, port: int, options: ScreenshotOptions = ScreenshotOptions()) -> Dict[str, Any]:
    """Open ``http://localhost:{port}`` in a fresh driver and save a screenshot of it.

    Args:
        path (str): The path to save the screenshot. Must end with ".png".
        port (int): The port on localhost where the website is served.
        options (ScreenshotOptions, optional): The options to use for taking the screenshot.
            Only ``options.resolution`` is read here. Defaults to ScreenshotOptions().

    Returns:
        infos (Dict[str, Any]): Additional information about the screenshot.
            Currently ``{"html": <page source>}``.

    Raises:
        ValueError: If the path does not end with .png
        Exception: If the driver could not be initialized. The message embeds the
            original error text, and the original exception is attached as the cause.
    """
    if not path.endswith(".png"):
        raise ValueError("The path should end with .png")

    driver: webdriver.Chrome
    try:
        driver = init_driver(url=f"http://localhost:{port}", resolution=options.resolution)
    except selenium.common.exceptions.WebDriverException as e:
        # Keep the original text inside the message: callers inspect str(e).
        raise Exception(f"Failed to initialize the driver: {e}") from e
    except Exception as e:
        raise Exception(f"An unknown error occurred while initializing the driver: {e}") from e

    try:
        # Extract the HTML of the page
        html = driver.page_source

        # Take a screenshot of the page
        driver.save_screenshot(path)
    finally:
        # Always release the browser, even if reading the page or saving the image failed.
        close_driver(driver)

    return {"html": html}
@@ -0,0 +1,182 @@
1
+ import subprocess
2
+ import os
3
+ import signal
4
+ from typing import Optional
5
+ import time
6
+ import socket
7
+ import threading
8
+
9
+ from helm.common.hierarchical_logger import hlog
10
+
11
+
12
class JekyllServer:
    """A class to start and stop a Jekyll server in a separate process.

    The server is launched with ``bundle exec jekyll serve`` inside ``repo_path``.
    The implementation relies on POSIX facilities (process groups, ``lsof``).
    """

    def __init__(self, repo_path: str, port: int, verbose: bool = False):
        """Store the configuration; nothing is started until ``start()`` is called.

        Args:
            repo_path (str): Directory containing the Jekyll site.
            port (int): Port the server should listen on.
            verbose (bool, optional): Whether to log progress with ``hlog``. Defaults to False.
        """
        self.repo_path: str = repo_path
        self.verbose: bool = verbose
        self.port: int = port
        self.process: Optional[subprocess.Popen] = None
        self.success: bool = False  # Shared flag to indicate if the server started successfully
        self._started: bool = False  # True once start() has been called on this instance

    def __del__(self):
        # __init__ may have failed before the attributes were assigned.
        if not hasattr(self, "process"):
            return
        self.stop()
        # Only reclaim the port if this instance ever launched a server: an instance
        # that was never started has no business killing whatever owns the port.
        if getattr(self, "_started", False) and self.is_port_in_use(self.port):
            if self.verbose:
                hlog(f"Port {self.port} is in use. Attempting to free it.")
            self.kill_process_using_port(self.port)
        if self.verbose:
            hlog("JekyllServer object deleted.")

    @staticmethod
    def _copy_file(source_path: str, destination_path: str) -> None:
        """Copy a file without going through a shell, so any path characters are safe."""
        import shutil  # Local import: this module does not import shutil at file level

        shutil.copyfile(source_path, destination_path)

    @staticmethod
    def _is_port_line(line: str) -> bool:
        """Return True if ``line`` is a top-level ``port:`` entry (and not e.g. ``portfolio:``)."""
        key, separator, _ = line.partition(":")
        return bool(separator) and key.rstrip() == "port"

    @staticmethod
    def _signal_process_group(process: subprocess.Popen, sig: int) -> None:
        """Send ``sig`` to the process group of ``process``; a vanished process is not an error."""
        try:
            os.killpg(os.getpgid(process.pid), sig)
        except ProcessLookupError:
            # The process (group) already exited: nothing left to signal.
            pass

    def setup_gemfile(self):
        """Make sure the repository has a Gemfile that mentions the jekyll gem."""
        gemfile_path = os.path.join(self.repo_path, "Gemfile")

        # Check if Gemfile exists, if not, copy Gemfile.default to Gemfile
        if not os.path.exists(gemfile_path):
            default_gemfile_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "Gemfile.default")
            self._copy_file(default_gemfile_path, gemfile_path)
            if self.verbose:
                hlog("Copied Gemfile.default to Gemfile")
            return

        # Gemfile exists, check if it has the jekyll gem
        with open(gemfile_path) as file:
            contents = file.read()
        if "jekyll" in contents:
            # TODO: figure out if we need to do anything here
            return

        # Gemfile exists, but doesn't have jekyll gem
        with open(gemfile_path, "a") as file:
            # Do not glue the new entry onto an unterminated last line.
            if contents and not contents.endswith("\n"):
                file.write("\n")
            file.write('gem "jekyll", "~> 4.3.3"\n')
        if self.verbose:
            hlog("Added jekyll gem to Gemfile")

    def setup_config(self):
        """Make sure ``_config.yml`` exists and point its ``port:`` entry at ``self.port``."""
        config_path = os.path.join(self.repo_path, "_config.yml")

        # Check if _config.yml exists, if not, copy _config.default.yml to _config.yml
        if not os.path.exists(config_path):
            default_config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "_config.default.yml")
            self._copy_file(default_config_path, config_path)
            if self.verbose:
                hlog("Copied _config.default.yml to _config.yml")

        # Replace every top-level "port:" line with "port: <port>"; other lines are kept verbatim.
        with open(config_path, "r") as file:
            lines = file.readlines()
        with open(config_path, "w") as file:
            file.writelines(f"port: {self.port}\n" if self._is_port_line(line) else line for line in lines)

    @staticmethod
    def is_port_in_use(port: int) -> bool:
        """Check if a port is in use on localhost."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(("localhost", port)) == 0

    def kill_process_using_port(self, port: int):
        """Find and kill the process using the specified port."""
        command = f"lsof -ti:{port} | grep '[0-9]' | xargs -r kill -9"
        os.system(command)
        if self.verbose:
            hlog(f"Killed process using port {port}.")

    def stream_output(self, process: subprocess.Popen):
        """Read the process output until the server reports success, fails, or goes silent.

        Sets ``self.success`` to True when the "Server running..." banner is seen on
        stdout, and to False when stdout is exhausted and a line is available on stderr.
        """
        assert process.stdout is not None
        assert process.stderr is not None
        while True:
            output = process.stdout.readline()
            if output:
                decoded_line = output.decode("utf-8").strip()
                if self.verbose:
                    hlog(f"\t> Stdout: {decoded_line}")
                if "Server running... press ctrl-c to stop." in decoded_line:
                    self.success = True
                    break
                continue

            # stdout is exhausted: look at stderr once, then stop either way.
            err = process.stderr.readline()
            if err:
                decoded_line = err.decode("utf-8").strip()
                if self.verbose:
                    hlog(f"\t> \033[91mStderr: {decoded_line}\033[0m")
                self.success = False
            break

    def start(self, timeout: int = 30) -> bool:
        """Start the Jekyll server in a separate process and monitor the output.

        Args:
            timeout (int, optional): Seconds to wait for the server to report that it is
                running. Defaults to 30.

        Returns:
            bool: True if the "Server running" banner was observed, False otherwise.
        """
        self._started = True
        if self.is_port_in_use(self.port):
            if self.verbose:
                hlog(f"Port {self.port} is in use. Attempting to free it.")
            self.kill_process_using_port(self.port)

        self.setup_gemfile()
        self.setup_config()

        # Run inside the repository through `cwd` rather than "cd <path> && ...", so the
        # path is never interpreted by the shell.
        subprocess.run(
            "bundle install",
            shell=True,
            cwd=self.repo_path,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Forget the outcome of any previous start() on this instance.
        self.success = False
        command_serve = f"bundle exec jekyll serve --port {self.port}"
        self.process = subprocess.Popen(
            command_serve,
            shell=True,
            cwd=self.repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # Own session/process group so stop() can signal the shell and its children together.
            start_new_session=True,
        )

        # Start thread to read output
        output_thread = threading.Thread(target=self.stream_output, args=(self.process,))
        output_thread.start()

        # Wait for the thread to complete or timeout
        output_thread.join(timeout=timeout)

        if output_thread.is_alive():
            # If the thread is still alive after the timeout, the server did not start
            # successfully within the timeout period
            hlog("Timeout reached without detecting server start.")
            # Stop the whole process group: terminating only the shell could leave the
            # server holding the pipes open, and the join below would never return.
            self.stop()
            output_thread.join()  # Ensure the thread is cleaned up
            return False

        if self.verbose:
            if self.success:
                hlog("Jekyll server started successfully.")
            else:
                hlog("Jekyll server failed to start.")
        return self.success  # Return the success flag

    def stop(self, timeout=5):
        """Stop the Jekyll server and terminate the process with a timeout.

        Args:
            timeout (int, optional): Time to wait for the server to gracefully shut down. Defaults to 5 seconds.
        """
        process = self.process
        if not process:
            if self.verbose:
                hlog("Jekyll server is not running.")
            return

        # Try to terminate the process group gracefully
        self._signal_process_group(process, signal.SIGTERM)
        process.terminate()

        # Wait for the process to end, checking periodically
        try:
            # Wait up to `timeout` seconds for process to terminate
            for _ in range(timeout):
                if process.poll() is not None:  # Process has terminated
                    break
                time.sleep(1)  # Wait a bit before checking again
            else:
                # If the process is still alive after the timeout, kill it
                self._signal_process_group(process, signal.SIGKILL)
                process.kill()
                process.wait()  # Wait for process to be killed
                if self.verbose:
                    hlog("Jekyll server forcefully stopped.")
        except Exception as e:
            if self.verbose:
                hlog(f"Error stopping the Jekyll server: {e}")

        self.process = None
        if self.verbose:
            hlog("Jekyll server stopped.")
@@ -0,0 +1,31 @@
1
+ import re
2
+
3
+ from helm.common.optional_dependencies import handle_module_not_found_error
4
+
5
+ try:
6
+ from html2text import HTML2Text
7
+ except ModuleNotFoundError as e:
8
+ handle_module_not_found_error(e, suggestions=["image2structure"])
9
+
10
+
11
def convert_html_to_text(handler: "HTML2Text", html: str) -> str:
    """Convert HTML to text and normalize its whitespace.

    Args:
        handler (HTML2Text): The HTML2Text handler; only its ``handle`` method is used
        html (str): The HTML to convert

    Returns:
        str: The text, with runs of spaces/tabs collapsed to one space, every line
            trimmed, blank lines removed, and the whole result stripped.
    """
    text: str = handler.handle(html)
    # Collapse any run of spaces and tabs into a single space. Doing both in one pass
    # matters: converting tabs after collapsing spaces would re-create multi-space runs.
    text = re.sub(r"[ \t]+", " ", text)
    # Remove leading and trailing spaces on each line
    text = re.sub(r"^[ \t]+|[ \t]+$", "", text, flags=re.MULTILINE)
    # Remove unnecessary whitespace - multiple empty lines and tabulations
    text = re.sub(r"\n\s*\n", "\n", text)

    return text.strip()
@@ -0,0 +1,225 @@
1
+ from typing import Dict, List, Any
2
+
3
+ from helm.benchmark.scenarios.scenario import VALID_SPLIT
4
+ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
5
+ Image2StructureScenario,
6
+ PROCESSED,
7
+ )
8
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
9
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
10
+ save_random_screenshot,
11
+ ScreenshotOptions,
12
+ )
13
+ from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
14
+ from helm.common.general import ensure_directory_exists
15
+ from helm.common.optional_dependencies import handle_module_not_found_error
16
+
17
+ try:
18
+ from html2text import HTML2Text
19
+ except ModuleNotFoundError as e:
20
+ handle_module_not_found_error(e, suggestions=["image2structure"])
21
+
22
+
23
+ import base64
24
+ import os
25
+ import threading
26
+ import tarfile
27
+ import shutil
28
+ import time
29
+ import pickle
30
+
31
+
32
def _is_within_directory(directory: str, target: str) -> bool:
    """Return True if ``target`` resolves to ``directory`` itself or to a path inside it."""
    abs_directory = os.path.realpath(directory)
    abs_target = os.path.realpath(target)
    return os.path.commonpath([abs_directory, abs_target]) == abs_directory


def extract_repo(base64_encoding: str, repo_path: str) -> None:
    """Decode a base64-encoded ``.tar.gz`` archive and extract it into ``repo_path``.

    The decoded archive is written to ``<repo_path>/repo.tar.gz`` and left there.

    Args:
        base64_encoding (str): Base64 encoding of a gzip-compressed tar archive.
        repo_path (str): Existing directory that receives the archive and its contents.

    Raises:
        ValueError: If decoding or extraction fails, or if an archive member (or the
            target of a link member) would land outside ``repo_path``.
    """
    archive_path = os.path.join(repo_path, "repo.tar.gz")
    try:
        archive = base64.b64decode(base64_encoding)
        # Write to .tar file
        with open(archive_path, "wb") as f:
            f.write(archive)
        # Extract
        with tarfile.open(archive_path, "r:gz") as tar:
            # The archive comes from dataset content, so refuse entries that would
            # escape repo_path ("../x", absolute names, links pointing outside).
            for member in tar.getmembers():
                destination = os.path.join(repo_path, member.name)
                if not _is_within_directory(repo_path, destination):
                    raise ValueError(f"Unsafe path in archive: {member.name}")
                if member.issym():
                    link_target = os.path.join(os.path.dirname(destination), member.linkname)
                elif member.islnk():
                    link_target = os.path.join(repo_path, member.linkname)
                else:
                    continue
                if not _is_within_directory(repo_path, link_target):
                    raise ValueError(f"Unsafe link in archive: {member.name} -> {member.linkname}")
            if hasattr(tarfile, "data_filter"):
                # Interpreters that ship extraction filters: also apply the stdlib "data" filter.
                tar.extractall(repo_path, filter="data")
            else:
                tar.extractall(repo_path)
    except Exception as e:
        raise ValueError(f"Failed to decode and extract the base64 archive: {e}") from e
45
+
46
+
47
def list_assets(repo_path: str, extensions: List[str]) -> List[str]:
    """List the asset files of a repository, as paths relative to ``repo_path``.

    The top-level ``_site`` directory and top-level directories whose name starts
    with a dot are skipped entirely.

    Args:
        repo_path (str): Root directory of the repository.
        extensions (List[str]): Accepted file extensions, without the leading dot.
            Matching is case-insensitive.

    Returns:
        List[str]: Relative paths of the matching files, in ``os.walk`` order.
    """
    allowed = {extension.lower() for extension in extensions}
    asset_paths: List[str] = []
    for root, dirs, files in os.walk(repo_path):
        if root == repo_path:
            # Prune excluded trees in place so os.walk never descends into them.
            dirs[:] = [d for d in dirs if d != "_site" and not d.startswith(".")]
        for file in files:
            # splitext yields "" for extension-less names, so a file literally named
            # "png" is not mistaken for an image.
            extension = os.path.splitext(file)[1][1:].lower()
            if extension in allowed:
                asset_paths.append(os.path.relpath(os.path.join(root, file), repo_path))
    return asset_paths
59
+
60
+
61
def serve_and_take_screenshot(
    repo_path: str,
    destination_path: str,
    screenshot_options: ScreenshotOptions = ScreenshotOptions(),
    max_tries: int = 5,
) -> Dict[str, Any]:
    """Serve ``repo_path`` with Jekyll, screenshot the served site, and stop the server.

    Args:
        repo_path (str): Directory of the Jekyll site to serve.
        destination_path (str): Where the screenshot is written.
        screenshot_options (ScreenshotOptions, optional): Options forwarded to the
            screenshot helper. Defaults to ScreenshotOptions().
        max_tries (int, optional): Maximum number of screenshot attempts. Only
            "connection refused" failures are retried. Defaults to 5.

    Returns:
        Dict[str, Any]: The information returned by ``save_random_screenshot``.

    Raises:
        ValueError: If the server fails to start or no screenshot could be taken.
    """
    # Start the Jekyll server
    # The port is derived from the thread identifier (4000 + ident % 1000).
    # NOTE(review): two threads whose identifiers are equal modulo 1000 would share a port.
    port: int = 4000 + int(threading.get_ident()) % 1000
    server = JekyllServer(repo_path, port=port, verbose=False)
    if not server.start():
        # This runs on examples that are not expected to fail
        server.stop()
        raise ValueError(f"Jekyll server failed to start: {repo_path}")

    # Take a screenshot of a random page
    success: bool = False
    infos: Dict[str, Any] = {}
    error = None  # Last failure seen; always recorded so it can be reported below
    try:
        for _ in range(max_tries):
            try:
                infos = save_random_screenshot(destination_path, port=port, options=screenshot_options)
                success = True
                break
            except Exception as e:
                error = e
                if "net::ERR_CONNECTION_REFUSED" not in str(e):
                    # Do not retry
                    break
                # The server is not reachable: restart it before the next attempt.
                server.stop()
                time.sleep(0.5)
                server.start()
                time.sleep(0.5)
    finally:
        # Stop the server whether or not a screenshot was taken.
        server.stop()

    if not success:
        raise ValueError(f"Failed to take a screenshot: {error}") from error

    time.sleep(0.1)

    return infos
103
+
104
+
105
class WebpageScenario(Image2StructureScenario):
    """Image-to-structure scenario whose structure is a website repository.

    Each row carries a base64-encoded archive of a repository. The repository is
    extracted, its image assets are copied to a persistent location, and the site is
    served with Jekyll so that a screenshot can be taken.
    """

    BASE_PROMPT = (
        "Please generate the source code to generate a webpage that looks like this image as much as feasibly possible.\n"  # noqa: E501
        "You should output a json object associating each file name with its content.\n\n"
        "Here is a simple example of the expected structure (that does not correspond to the image)."
        " In this example, 3 files are created: index.html, style.css and script.js.\n"
        "[\n"
        " {\n"
        ' "filename": "index.html",\n'
        ' "content": "<!DOCTYPE html>\\n<html>\\n<head>\\n<title>Title of the document</title>\\n</head>\\n<body>\\n\\n<p>Content of the document......</p>\\n\\n</body>\\n</html>"\n'  # noqa: E501
        " },\n"
        " {\n"
        ' "filename": "style.css",\n'
        ' "content": "body {\\n background-color: lightblue;\\n}\\nh1 {\\n color: white;\\n text-align: center;\\n}"\n'  # noqa: E501
        " },\n"
        " {\n"
        ' "filename": "script.js",\n'
        ' "content": "document.getElementById(\\"demo\\").innerHTML = \\"Hello JavaScript!\\";"\n'
        " }\n"
        "]\n"
        "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
        " look like the image as much as feasibly possible."
    )

    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
    SUBSETS = ["css", "html", "javascript"]
    MAX_TRIES: int = 5
    ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]

    name = "image2webpage"
    description = "Evaluate multimodal models on webpage generation to recreate a provided image"

    def __init__(
        self,
        subset: str,
        recompile_prompt: bool = True,
        split: str = VALID_SPLIT,
        screenshot_options: ScreenshotOptions = ScreenshotOptions(),
    ):
        """Forward the common arguments to the base scenario and set up the HTML-to-text handler.

        Args:
            subset (str): Forwarded to the base class.
            recompile_prompt (bool, optional): Forwarded to the base class. Defaults to True.
            split (str, optional): Forwarded to the base class. Defaults to VALID_SPLIT.
            screenshot_options (ScreenshotOptions, optional): Options used when taking screenshots.
        """
        super().__init__(subset, recompile_prompt, split)
        self._screenshot_options = screenshot_options
        self._html2text = HTML2Text()
        self._html2text.ignore_links = True

    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
        """Extract the repository of a row and persist its image assets.

        On return, ``row["structure"]`` is ``PROCESSED``, ``row["assets"]`` is removed and
        ``row["assets_paths"]`` / ``row["assets_names"]`` are parallel lists (persistent
        copy, path inside the repository). When the repository had to be extracted,
        ``row["repo_path"]`` points at the temporary extraction directory.

        Args:
            row (Dict[str, Any]): Dataset row; reads ``uuid``, ``structure`` and ``assets``.
            assets_path (str): Directory under which per-row asset folders are stored.

        Returns:
            Dict[str, Any]: The same ``row`` object, updated in place.

        Raises:
            ValueError: If the output path is not set or the archive cannot be extracted.
        """
        # Convert to str before stripping quotes, so a non-string uuid does not fail on .replace().
        assets_save_path: str = os.path.join(assets_path, str(row["uuid"]).replace('"', ""))
        paths_file: str = os.path.join(assets_save_path, "assets_paths.pkl")
        names_file: str = os.path.join(assets_save_path, "assets_names.pkl")

        # No need to reprocess if the assets are already saved
        if os.path.exists(assets_save_path):
            try:
                # NOTE(review): pickle is only safe here because these files are written by
                # this method; never point assets_path at a directory others can write to.
                with open(paths_file, "rb") as f:
                    cached_paths = pickle.load(f)
                with open(names_file, "rb") as f:
                    cached_names = pickle.load(f)
            except Exception:
                # There was an issue when loading the assets, reprocess
                shutil.rmtree(assets_save_path)
            else:
                # Only touch the row once both files loaded, so a failed load leaves it intact.
                row["assets_paths"] = cached_paths
                row["assets_names"] = cached_names
                row.pop("assets", None)
                row["structure"] = PROCESSED
                return row
        ensure_directory_exists(assets_save_path)

        # Structure is a base64 encoding of the repo
        if self._output_path is None:
            raise ValueError("Output path not set")
        repo_path = os.path.join(self._output_path, f"tmp{threading.get_ident()}_{self._subset}")
        ensure_directory_exists(repo_path)

        # Decode the base64 string which corresponds to an archive
        # and extract the files to the repo_path
        structure: str = row["structure"]
        extract_repo(structure, repo_path)
        row["structure"] = PROCESSED
        row["repo_path"] = repo_path  # Stored for cleanup

        # Process the assets
        asset_paths: List[str] = list_assets(repo_path, self.ASSETS_EXTENSIONS)
        row.pop("assets", None)
        row["assets_paths"] = []
        row["assets_names"] = []
        # Copy each asset to a unique persistent path
        for i, asset_local_path in enumerate(asset_paths):
            asset_dest_path = os.path.join(assets_save_path, f"{i}.{asset_local_path.split('.')[-1]}")
            shutil.copyfile(os.path.join(repo_path, asset_local_path), asset_dest_path)
            row["assets_paths"].append(asset_dest_path)
            row["assets_names"].append(asset_local_path)

        # Save both assets_paths and assets_names as files than can be loaded
        with open(paths_file, "wb") as f:
            pickle.dump(row["assets_paths"], f)
        with open(names_file, "wb") as f:
            pickle.dump(row["assets_names"], f)

        return row

    def build_prompt(self, row: Dict[str, Any]) -> str:
        """Return ``BASE_PROMPT``, followed by the list of available asset names if there are any."""
        prompt: str = self.BASE_PROMPT
        assert "assets_paths" in row, "No assets paths in the row"
        assert "assets_names" in row, "No assets names in the row"
        assert len(row["assets_paths"]) == len(row["assets_names"])
        if len(row["assets_names"]) > 0:
            prompt += "\nYou have access to the following assets:\n"
            prompt += "".join(f"- {asset_local_path}\n" for asset_local_path in row["assets_names"])
        return prompt

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Serve the repository, save a screenshot to ``destination_path`` and return the page text.

        Args:
            structure (str): The path to the repo.
            assets_path (str): Not used by this implementation.
            destination_path (str): Where the screenshot is written.

        Returns:
            str: The served page's HTML converted to normalized plain text.
        """
        # Structure is the path to the repo
        # Serve and take screenshot
        repo_path: str = structure
        infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, self._screenshot_options)
        text: str = convert_html_to_text(self._html2text, infos["html"])
        return text

    def finalize(self, row: Dict[str, Any]) -> None:
        """Perform cleanup operations after the instance has been generated."""
        if "repo_path" in row:
            repo_path: str = row["repo_path"]
            shutil.rmtree(repo_path)
@@ -0,0 +1,124 @@
1
+ import os.path
2
+ from typing import List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ TRAIN_SPLIT,
11
+ Input,
12
+ Instance,
13
+ Output,
14
+ Reference,
15
+ Scenario,
16
+ )
17
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
18
+ from helm.common.media_object import MediaObject, MultimediaObject
19
+
20
+
21
class MementosScenario(Scenario):
    """
    Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences

    Mementos is a benchmark designed to assess MLLMs' sequential image reasoning abilities. It
    features 4,761 diverse image sequences with varying lengths.

    @misc{wang2024mementos,
        title={Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences},
        author={Xiyao Wang and Yuhang Zhou and Xiaoyu Liu and Hongjin Lu and Yuancheng Xu and Feihong He and Jaehong Yoon
        and Taixi Lu and Gedas Bertasius and Mohit Bansal and Huaxiu Yao and Furong Huang},
        year={2024},
        eprint={2401.10529},
        archivePrefix={arXiv},
        primaryClass={cs.CV}
    }

    Paper: https://arxiv.org/abs/2401.10529
    """

    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"

    IMAGE_URL: str = (
        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
        + "{subject}/{split}/{file_name}?download=true"
    )

    DATA_FILES: str = "{subject}/{split}/metadata.csv"

    QUESTION_PROMPT: str = (
        "Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
    )

    SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]

    name = "mementos"
    description = (
        "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
        " ([paper](https://arxiv.org/abs/2401.10529))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        """Create the scenario for one subject; ``subject`` must be one of ``SUBJECTS``."""
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one instance per metadata row of the train and test splits.

        For every row the referenced image is downloaded next to the dataset cache, and
        an instance is created whose input is the image followed by ``QUESTION_PROMPT``
        and whose single correct reference is the row's ``description``.

        Args:
            output_path (str): Root directory for downloads; files are stored under
                ``<output_path>/<subject>/<split>``.

        Returns:
            List[Instance]: Train instances followed by test instances.
        """
        subject_output_path: str = os.path.join(output_path, self._subject)
        ensure_directory_exists(subject_output_path)

        splits: List[str] = [TRAIN_SPLIT, TEST_SPLIT]
        data_files = {split: self.DATA_FILES.format(subject=self._subject, split=split) for split in splits}
        instances: List[Instance] = []

        for split in splits:
            cur_output_path = os.path.join(subject_output_path, split)
            ensure_directory_exists(cur_output_path)

            # Load the metadata rows of the current split
            dataset = load_dataset(
                self.MEMENTOS_HUGGINGFACE_DATASET_NAME,
                data_files=data_files,
                split=split,
                cache_dir=cur_output_path,
            )
            for row in tqdm(dataset):
                # Download the image
                file_name: str = row["file_name"]
                local_image_path: str = os.path.join(cur_output_path, file_name)
                image_url: str = self.IMAGE_URL.format(subject=self._subject, split=split, file_name=file_name)
                ensure_file_downloaded(source_url=image_url, target_path=local_image_path, unpack=False)

                # The image comes first, then the question
                image = MediaObject(location=local_image_path, content_type="image/png")
                question = MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain")
                content: List[MediaObject] = [image, question]
                reference = Reference(Output(text=row["description"]), tags=[CORRECT_TAG])
                instance = Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=[reference],
                    split=split,
                )
                instances.append(instance)

            print()

        return instances
115
+
116
+
117
def main():
    """Build the "robotics" instances under the ``output`` directory and print them."""
    print(MementosScenario("robotics").get_instances("output"))


if __name__ == "__main__":
    main()