evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/ifeval/instructions.py

@@ -21,7 +21,7 @@ import re
 import string
 from typing import Dict, Optional, Sequence, Union
 
-from evalscope.benchmarks.ifeval import instructions_util
+from . import instructions_util
 
 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
 
@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
         if self._language is None:
             self._language = random.choice(list(_LANGUAGES.keys()))
         # TODO(tianjianlu): opens the description generation to more choices.
-        self._description_pattern = ('Your ENTIRE response should be in {language} language, no other '
-                                     + 'language is allowed.')
+        self._description_pattern = (
+            'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+        )
         return self._description_pattern.format(language=_LANGUAGES[self._language])
 
     def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError('The supported relation for comparison must be in '
-                             f'{_COMPARISON_RELATION}, but {relation} is given.')
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
         self._num_placeholders = num_placeholders
         if self._num_placeholders is None or self._num_placeholders < 0:
             self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = ('The response must contain at least {num_placeholders} placeholders '
-                                     + 'represented by square brackets, such as [address].')
+        self._description_pattern = (
+            'The response must contain at least {num_placeholders} placeholders '
+            + 'represented by square brackets, such as [address].'
+        )
         return self._description_pattern.format(num_placeholders=self._num_placeholders)
 
     def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
         self._num_bullets = num_bullets
         if self._num_bullets is None or self._num_bullets < 0:
             self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = ('Your answer must contain exactly {num_bullets} bullet points. '
-                                     + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n'
-                                     + '* This is point 2')
+        self._description_pattern = (
+            'Your answer must contain exactly {num_bullets} bullet points. '
+            + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+        )
         return self._description_pattern.format(num_bullets=self._num_bullets)
 
     def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
         self._starter = starter.strip() if isinstance(starter, str) else starter
         if self._starter is None:
             self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = ('During the conversation, when it is your turn, '
-                                     + 'please always start with {starter}')
+        self._description_pattern = (
+            'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+        )
         return self._description_pattern.format(starter=self._starter)
 
     def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
         if self._num_highlights is None or self._num_highlights < 0:
             self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
 
-        self._description_pattern = ('Highlight at least {num_highlights} sections in your answer with '
-                                     + 'markdown, i.e. *highlighted section*.')
+        self._description_pattern = (
+            'Highlight at least {num_highlights} sections in your answer with '
+            + 'markdown, i.e. *highlighted section*.'
+        )
 
         return self._description_pattern.format(num_highlights=self._num_highlights)
 
@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
         if self._num_sections is None or self._num_sections < 0:
             self._num_sections = random.randint(1, _NUM_SECTIONS)
 
-        self._description_pattern = ('Your response must have {num_sections} sections. Mark the beginning '
-                                     + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
-                                     + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]')
+        self._description_pattern = (
+            'Your response must have {num_sections} sections. Mark the beginning '
+            + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+            + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+        )
 
         return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)
 
@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
         if self._num_paragraphs is None or self._num_paragraphs < 0:
             self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
 
-        self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
-                                     + 'Paragraphs are separated with the markdown divider: ***')
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+        )
 
         return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
 
@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
           A string representing the instruction description.
         """
         self._postscript_marker = (
-            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker)
+            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+        )
         if self._postscript_marker is None:
             self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
 
-        self._description_pattern = ('At the end of your response, please explicitly add a postscript '
-                                     + 'starting with {postscript}')
+        self._description_pattern = (
+            'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+        )
 
         return self._description_pattern.format(postscript=self._postscript_marker)
 
@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
                              'in the form of *change me*.')
 
         self._reference_without_change = original_message
-        self._description = ('Rephrasing: Your rephrased response should only'
-                             + 'change the words/sentences in between two asterisks' + 'such as *change me*.')
+        self._description = (
+            'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+            + 'such as *change me*.'
+        )
         return self._description
 
     def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError('The supported relation for comparison must be in '
-                             f'{_COMPARISON_RELATION}, but {relation} is given.')
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
-        self._description_pattern = ('In your response, the word {keyword} should appear {relation} '
-                                     + '{frequency} times.')
+        self._description_pattern = (
+            'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+        )
 
         return self._description_pattern.format(
             keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError('The supported relation for comparison must be in '
-                             f'{_COMPARISON_RELATION}, but {relation} is given.')
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
     """Check the Json format."""
 
     def build_description(self):
-        self._description_pattern = ('Entire output should be wrapped in JSON format. You can use markdown'
-                                     ' ticks such as ```.')
+        self._description_pattern = (
+            'Entire output should be wrapped in JSON format. You can use markdown'
+            ' ticks such as ```.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):
 
     def check_following(self, value):
         value = (
-            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
-                '```').removesuffix('```').strip())
+            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+            removesuffix('```').strip()
+        )
         try:
             json.loads(value)
         except ValueError:
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
             self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
             self._first_word = self._first_word.lower()
 
-        self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
-                                     + 'Paragraphs and only paragraphs are separated with each other by two '
-                                     + "new lines as if it was '\\n\\n' in python. "
-                                     + 'Paragraph {nth_paragraph} must start with word {first_word}.')
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. '
+            + 'Paragraphs and only paragraphs are separated with each other by two '
+            + "new lines as if it was '\\n\\n' in python. "
+            + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+        )
 
         return self._description_pattern.format(
            num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
         self._low = low
         self._high = high
 
-        self._description = ('Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
-                             + 'between {low} and {high} of the same words. '
-                             + 'Words are the same if and only if all of the '
-                             + 'letters, ignoring cases, are the same. For '
-                             + "example, 'run' is the same as 'Run' but different " + "to 'ran'.")
+        self._description = (
+            'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+            + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+            + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+            + "to 'ran'."
+        )
 
         return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)
 
@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = ('Give two different responses. Responses and only responses should'
-                                     ' be separated by 6 asterisk symbols: ******.')
+        self._description_pattern = (
+            'Give two different responses. Responses and only responses should'
+            ' be separated by 6 asterisk symbols: ******.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
             raise ValueError('prompt_to_repeat must be set.')
         else:
             self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = ('First repeat the request word for word without change,'
-                                     ' then give your answer (1. do not say any words or characters'
-                                     ' before repeating the request; 2. the request you need to repeat'
-                                     ' does not include this sentence)')
+        self._description_pattern = (
+            'First repeat the request word for word without change,'
+            ' then give your answer (1. do not say any words or characters'
+            ' before repeating the request; 2. the request you need to repeat'
+            ' does not include this sentence)'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
         self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = ('Finish your response with this exact phrase {ender}. '
-                                     'No other words should follow this phrase.')
+        self._description_pattern = (
+            'Finish your response with this exact phrase {ender}. '
+            'No other words should follow this phrase.'
+        )
         return self._description_pattern.format(ender=self._end_phrase)
 
     def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = ('Your answer must contain a title, wrapped in double angular brackets,'
-                                     ' such as <<poem of joy>>.')
+        self._description_pattern = (
+            'Your answer must contain a title, wrapped in double angular brackets,'
+            ' such as <<poem of joy>>.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
         if let_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError('The supported relation for comparison must be in '
-                             f'{_COMPARISON_RELATION}, but {let_relation} is given.')
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+            )
         else:
             self._comparison_relation = let_relation
 
-        self._description_pattern = ('In your response, the letter {letter} should appear {let_relation}'
-                                     ' {let_frequency} times.')
+        self._description_pattern = (
+            'In your response, the letter {letter} should appear {let_relation}'
+            ' {let_frequency} times.'
+        )
 
         return self._description_pattern.format(
             letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = ('Your entire response should be in English, and in all lowercase'
-                                     ' letters. No capital letters are allowed.')
+        self._description_pattern = (
+            'Your entire response should be in English, and in all lowercase'
+            ' letters. No capital letters are allowed.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
         if capital_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError('The supported relation for comparison must be in '
-                             f'{_COMPARISON_RELATION}, but {capital_relation} is given.')
-
-        self._description_pattern = ('In your response, words with all capital letters should appear'
-                                     ' {relation} {frequency} times.')
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+            )
+
+        self._description_pattern = (
+            'In your response, words with all capital letters should appear'
+            ' {relation} {frequency} times.'
+        )
 
         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
 
evalscope/benchmarks/ifeval/instructions_registry.py

@@ -13,7 +13,7 @@
 # limitations under the License.
 """Registry of all instructions."""
 
-from evalscope.benchmarks.ifeval import instructions
+from . import instructions
 
 _KEYWORD = 'keywords:'
 
evalscope/benchmarks/ifeval/instructions_util.py

@@ -14,7 +14,6 @@
 """Utility library of instructions."""
 
 import functools
-import immutabledict
 import nltk
 import os
 import random
@@ -1551,7 +1550,7 @@ WORD_LIST = [
 ]  # pylint: disable=line-too-long
 
 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
     'en': 'English',
     'es': 'Spanish',
     'pt': 'Portuguese',
@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
     'pa': 'Punjabi',
     'ml': 'Malayalam',
     'fi': 'Finnish',
-})
+}
 
 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
evalscope/benchmarks/ifeval/utils.py

@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union
 
-from evalscope.benchmarks.ifeval import instructions_registry
+from . import instructions_registry
 
 
 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)
 
     return {
-        'prompt_level_strict_acc': out_strict.follow_all_instructions,
-        'inst_level_strict_acc': out_strict.follow_instruction_list,
-        'prompt_level_loose_acc': out_loose.follow_all_instructions,
-        'inst_level_loose_acc': out_loose.follow_instruction_list,
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }
 
 
 def agg_inst_level_acc(items):
-    flat_items = [item for sublist in items for item in sublist]
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
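The `process_results` rewrite changes IFEval's per-sample output: the metric keys drop the `_acc` suffix, the prompt-level booleans are cast to floats, and the per-instruction boolean list is now reduced to a single mean by `agg_inst_level_acc` (which also guards against empty lists) instead of being returned raw. A minimal sketch of the new semantics, using made-up per-instruction results:

    # Illustrative input: strict-mode results for a prompt carrying three instructions.
    follow_list = [True, False, True]

    def agg_inst_level_acc(items):
        # Mean over per-instruction booleans; 0 for an empty list.
        return sum(items) / len(items) if items else 0

    result = {
        'prompt_level_strict': float(all(follow_list)),        # 0.0 (all-or-nothing)
        'inst_level_strict': agg_inst_level_acc(follow_list),  # ~0.667 (partial credit)
    }
    print(result)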
evalscope/benchmarks/image_edit/gedit/gedit_adapter.py (new file)

@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import os
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, ImageEditAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import FileConstants, Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'background_change', 'color_alter', 'material_alter', 'motion_change', 'ps_human', 'style_change', 'subject-add',
+    'subject-remove', 'subject-replace', 'text_change', 'tone_transfer'
+]
+
+LANGUAGE_LIST = ['en', 'cn']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='gedit',
+        pretty_name='GEdit-Bench',
+        dataset_id='stepfun-ai/GEdit-Bench',
+        description='GEdit-Bench Image Editing Benchmark, grounded in real-world '
+        'usages is developed to support more authentic and '
+        'comprehensive evaluation of image editing models.',
+        tags=[Tags.IMAGE_EDITING],
+        subset_list=SUBSET_LIST,
+        metric_list=['Semantic Consistency', 'Perceptual Similarity'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={'language': f'# language of the instruction, choose from {LANGUAGE_LIST}, default to `en`'}
+    )
+)
+class GEditAdapter(ImageEditAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.language = self.extra_params.get('language', 'en')
+        if self.language not in LANGUAGE_LIST:
+            logger.warning(f"Invalid language '{self.language}', fallback to 'en'")
+            self.language = 'en'
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+        self.load_prompt()
+
+    def load_prompt(self):
+        from . import vie_prompts
+
+        self.context = vie_prompts._context_no_delimit
+        self.SC_prompt = '\n'.join([
+            self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC
+        ])
+        self.PQ_prompt = '\n'.join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        record = copy.deepcopy(record)
+
+        # Process instruction and image
+        instruction = record['instruction']
+        image_bytes = record['input_image']['bytes']
+        input_image = bytes_to_base64(image_bytes, format='png', add_header=True)
+        record['input_image'] = input_image
+        record[FileConstants.ID] = record['key']
+        del record['input_image_raw']
+
+        text_content = ContentText(text=instruction)
+        image_content = ContentImage(image=input_image)
+
+        messages: List[ChatMessage] = [
+            ChatMessageUser(content=[text_content, image_content]),
+        ]
+
+        return Sample(input=messages, subset_key=record['task_type'], metadata=record)
+
+    def sample_filter(self, sample: Sample) -> bool:
+        language = sample.metadata.get('instruction_language', 'en')
+        return super().sample_filter(sample) and language == self.language
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state: TaskState) -> Score:
+        import math
+
+        from .utils import mllm_output_to_dict
+
+        metadata = task_state.metadata
+        text_prompt = metadata['instruction']
+        input_image = metadata['input_image']  # base64 image
+        edited_image = metadata[FileConstants.IMAGE_PATH]  # local image path
+        _SC_prompt = self.SC_prompt.replace('<instruction>', text_prompt)
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=edited_image,
+            prediction=edited_image,
+        )
+
+        # Build prompts
+        SC_prompt_final = [
+            ChatMessageUser(
+                content=[
+                    ContentImage(image=input_image),
+                    ContentImage(image=edited_image),
+                    ContentText(text=_SC_prompt)
+                ]
+            )
+        ]
+        PQ_prompt_final = [
+            ChatMessageUser(content=[ContentImage(image=edited_image),
+                                     ContentText(text=self.PQ_prompt)])
+        ]
+
+        guess_if_cannot_parse = True
+        result_SC = self.llm_judge.judge(messages=SC_prompt_final)
+        result_PQ = self.llm_judge.judge(messages=PQ_prompt_final)
+        SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
+        PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
+
+        SC_score = min(SC_dict['score'])
+        PQ_score = min(PQ_dict['score'])
+        O_score = math.sqrt(SC_score * PQ_score)
+
+        score.value = {'Semantic Consistency': SC_score, 'Perceptual Quality': PQ_score, 'Overall': O_score}
+        score.main_score_name = 'Overall'
+        score.metadata = {
+            'SC_dict': SC_dict,
+            'PQ_dict': PQ_dict,
+        }
+        return score
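Note that the adapter's overall metric is the geometric mean of the two judge scores: with SC = 6 and PQ = 8, Overall = sqrt(6 * 8) ≈ 6.93, so a low score on either axis drags the overall score down. A hypothetical way to run the new benchmark, assuming the `TaskConfig`/`run_task` entry points and the `judge_model_args` judge configuration carry over to 1.0.x as documented in the project README (the model names below are placeholders, not confirmed against the 1.0.1 API):

    # Hypothetical sketch: GEdit-Bench needs a multimodal LLM judge for SC/PQ scoring.
    from evalscope import TaskConfig, run_task

    task = TaskConfig(
        model='my-image-edit-model',  # placeholder: the image-editing model under test
        datasets=['gedit'],
        dataset_args={
            'gedit': {
                'extra_params': {'language': 'en'},  # 'en' or 'cn', per LANGUAGE_LIST
            },
        },
        judge_model_args={'model': 'qwen-vl-max'},  # placeholder judge; must accept images
    )
    run_task(task_cfg=task)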