evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0

evalscope/api/benchmark/adapters/multi_choice_adapter.py (new file)
@@ -0,0 +1,83 @@
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.utils.multi_choices import (
+    FEW_SHOT_TEMPLATE,
+    MultipleChoiceTemplate,
+    format_example,
+    parse_answers,
+    parse_answers_zh,
+    prompt,
+    valid_template,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+
+class MultiChoiceAdapter(DefaultDataAdapter):
+    """
+    Adapter for multi-choice benchmarks.
+    This adapter formats the input for multi-choice questions and handles few-shot examples.
+    """
+
+    multiple_correct: bool = False
+    """Whether the benchmark allows multiple correct answers."""
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        """
+        Format the basic prompt template with the sample data.
+
+        Args:
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The formatted prompt ready for model input
+        """
+        assert valid_template(self.prompt_template), 'Prompt template is not valid'
+
+        return prompt(
+            question=sample.input,
+            choices=Choices(sample.choices),
+            template=self.prompt_template,
+        )
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        """
+        Format the few-shot template with demonstrations and the main prompt.
+
+        Args:
+            fewshot (str): The formatted few-shot demonstration examples
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The complete formatted input with few-shot context
+        """
+
+        few_shot_prompt_template = self.few_shot_prompt_template or (FEW_SHOT_TEMPLATE + self.prompt_template)
+
+        assert valid_template(few_shot_prompt_template), 'Few-shot prompt template is not valid'
+
+        return prompt(
+            question=sample.input, choices=Choices(sample.choices), template=few_shot_prompt_template, fewshot=fewshot
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Convert a sample to a few-shot formatted string.
+
+        Args:
+            sample (Sample): The sample object to format
+
+        Returns:
+            str: The formatted few-shot example string
+        """
+        return format_example(question=sample.input, choices=Choices(sample.choices), answer=Target(sample.target))
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        if self.prompt_template in [
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE
+        ]:
+            # For Chinese COT template, we use a different extraction method
+            answers = parse_answers_zh(task_state, multiple_correct=self.multiple_correct)
+        else:
+            answers = parse_answers(task_state, multiple_correct=self.multiple_correct)
+        return ''.join(sorted(list(answers)))
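
For orientation, a minimal sketch of how a concrete benchmark might build on the new MultiChoiceAdapter. The subclass name, record fields, and the assumption that Sample accepts choices/target keyword arguments (mirroring the sample.choices / sample.target attributes read above) are illustrative, not taken from the package.

    from evalscope.api.benchmark.adapters.multi_choice_adapter import MultiChoiceAdapter
    from evalscope.api.dataset.dataset import Sample


    class MyMCQAdapter(MultiChoiceAdapter):
        """Hypothetical adapter: turns raw records into Samples that the
        MultiChoiceAdapter prompt-formatting and answer-extraction hooks consume."""

        multiple_correct = False  # single correct answer, as in the base class default

        def record_to_sample(self, record: dict) -> Sample:
            # Assumes Sample takes choices/target keywords; record layout is made up.
            return Sample(
                input=record['question'],
                choices=record['choices'],  # e.g. ['Paris', 'Rome', 'Berlin', 'Madrid']
                target=record['answer'],    # e.g. 'A'
            )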

evalscope/api/benchmark/adapters/text2image_adapter.py (new file)
@@ -0,0 +1,155 @@
+import base64
+import os
+
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import ContentImage
+from evalscope.api.metric import Score
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
+from evalscope.api.registry import get_metric
+from evalscope.constants import EvalType
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class Text2ImageAdapter(DefaultDataAdapter):
+    """Text to Image Adapter for benchmarks."""
+
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': record.get('category', ''),
+                'tags': record.get('tags', []),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return ModelOutput(
+                model=model.name,
+                choices=[ChatCompletionChoice.from_content('')],
+            )
+        else:
+            # Execute model inference with the processed input and any tools
+            model_output = model.generate(input=sample.input, tools=sample.tools)
+            return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes. Save generated images to output_dir.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+        else:
+            image_id = f"{sample.metadata.get('id', sample.id)}_{sample.group_id}"
+            output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
+            if not os.path.exists(os.path.dirname(output_path)):
+                os.makedirs(os.path.dirname(output_path))
+            # get base64 image from model_output
+            content = model_output.message.content[0]
+
+            assert isinstance(content, ContentImage), 'Expected ContentImage in model output'
+
+            image_base64 = content.image
+            with open(output_path, 'wb') as f:
+                f.write(base64.b64decode(image_base64))
+
+            sample.metadata['image_path'] = output_path
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+
+    # NOTE: thread safe is needed, since we can't batch inference here.
+    @thread_safe
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get('image_path', original_prediction)
+        prompt = task_state.input[0].content
+        meta = task_state.metadata
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(image_path, prompt)[0]
+
+                # fine-granular metrics
+                category = meta.get('category')
+                if category:
+                    metric_name = f'{metric_name}_{category}'
+                if isinstance(metric_score, dict):
+                    for k, v in metric_score.items():
+                        score.value[f'{metric_name}_{k}'] = v.cpu().item()
+                else:
+                    score.value[metric_name] = metric_score.cpu().item()
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add aggregation name for needle haystack adapter
+        return super()._on_generate_report(scores, model_name, False)
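
The match_score loop above accepts two shapes of metric_list entry: a bare metric name, resolved from the registry and instantiated with no arguments, or a single-key dict whose value supplies constructor kwargs. A hedged illustration follows; the metric names and kwargs are placeholders, not names verified against the registry.

    # Illustrative only: names and kwargs are placeholders.
    metric_list = [
        'clip_score',                              # str  -> get_metric('clip_score')()
        {'fga_blip2_score': {'device': 'cuda'}},   # dict -> get_metric('fga_blip2_score')(device='cuda')
    ]
    # Each metric is then called as metric_func(image_path, prompt)[0]; scalar results
    # are stored under the metric name (suffixed with the sample's category when present),
    # while dict results are flattened into '<metric>_<key>' entries of score.value.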

evalscope/api/benchmark/benchmark.py (new file)
@@ -0,0 +1,321 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from evalscope.api.dataset import DatasetDict, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
+from evalscope.api.metric import AggScore, SampleScore
+from evalscope.api.mixin import LLMJudgeMixin
+from evalscope.api.model import Model
+from evalscope.report import Report
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class DataAdapter(LLMJudgeMixin, ABC):
+    """
+    Data Adapter for the benchmark.
+    """
+
+    def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskConfig'] = None):
+        self._benchmark_meta = benchmark_meta
+        self._task_config = task_config
+        super().__init__(task_config=task_config)
+
+        self.reformat_subset = False
+        """Whether to reformat the subset data with subset key"""
+
+        self.split_as_subset = False
+        """Whether to use the split name as the dataset subsets"""
+
+        self.shuffle_choices = False
+        """Whether to shuffle the choices in the dataset"""
+
+        self.save_metadata = True
+        """Whether to save metadata in the review result"""
+
+        self.category_map = {}
+        """Category map for the benchmark"""
+
+        self.current_subset_name = ''
+        """Subset name when loading datasets"""
+
+        # dataset
+        self.test_dataset: Optional[DatasetDict] = None
+        """Dataset to be evaluated"""
+
+        self.fewshot_dataset: Optional[DatasetDict] = None
+        """Dataset for few-shot evaluation"""
+
+        # filters
+        self._filter_ensemble: Optional[OrderedDict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the benchmark metadata to a dictionary."""
+        return self._benchmark_meta.to_string_dict()
+
+    @abstractmethod
+    def load_dataset(self) -> DatasetDict:
+        pass
+
+    @abstractmethod
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        pass
+
+    @abstractmethod
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        pass
+
+    @abstractmethod
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        pass
+
+    @abstractmethod
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a report based on the evaluation results.
+        """
+        pass
+
+    @property
+    def name(self) -> str:
+        """
+        Return the unique name of the benchmark.
+        """
+        return self._benchmark_meta.name
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Return the dataset ID or path to the benchmark.
+        """
+        return self._benchmark_meta.dataset_id
+
+    @property
+    def output_types(self) -> Optional[List[str]]:
+        """
+        Return the output types of the benchmark.
+        """
+        return self._benchmark_meta.output_types
+
+    @property
+    def limit(self) -> Optional[Union[int, float]]:
+        """
+        Return the limit for the benchmark.
+        """
+        return self._task_config.limit
+
+    @property
+    def repeats(self) -> int:
+        """
+        Return the number of repeats for each sample in the benchmark.
+        """
+        return self._task_config.repeats
+
+    @property
+    def dataset_hub(self) -> str:
+        """
+        Return the dataset hub type for the benchmark.
+        """
+        return self._task_config.dataset_hub
+
+    @dataset_hub.setter
+    def dataset_hub(self, value: str):
+        """
+        Set the dataset hub type for the benchmark.
+        """
+        self._task_config.dataset_hub = value
+
+    @property
+    def eval_type(self) -> str:
+        """
+        Return the evaluation type for the benchmark.
+        """
+        return self._task_config.eval_type
+
+    @property
+    def subset_list(self) -> List[str]:
+        """
+        Return the subset list of the benchmark.
+        """
+        return self._benchmark_meta.subset_list
+
+    @subset_list.setter
+    def subset_list(self, value: List[str]):
+        """
+        Set the subset list of the benchmark.
+        """
+        self._benchmark_meta.subset_list = value
+
+    @property
+    def metric_list(self) -> List[Union[str, Dict[str, Any]]]:
+        """
+        Return the metric list of the benchmark.
+        """
+        return self._benchmark_meta.metric_list
+
+    @property
+    def default_subset(self) -> str:
+        """
+        Return the default subset of the benchmark.
+        """
+        return self._benchmark_meta.default_subset
+
+    @property
+    def few_shot_num(self) -> int:
+        """
+        Return the few shot number of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_num
+
+    @few_shot_num.setter
+    def few_shot_num(self, value: int):
+        """
+        Set the few shot number of the benchmark.
+        """
+        self._benchmark_meta.few_shot_num = value
+
+    @property
+    def few_shot_random(self) -> bool:
+        """
+        Return whether few shot is random for the benchmark.
+        """
+        return self._benchmark_meta.few_shot_random
+
+    @property
+    def train_split(self) -> Optional[str]:
+        """
+        Return the train split of the benchmark.
+        """
+        return self._benchmark_meta.train_split
+
+    @property
+    def eval_split(self) -> Optional[str]:
+        """
+        Return the eval split of the benchmark.
+        """
+        return self._benchmark_meta.eval_split
+
+    @property
+    def prompt_template(self) -> Optional[str]:
+        """
+        Return the prompt template of the benchmark.
+        """
+        return self._benchmark_meta.prompt_template
+
+    @prompt_template.setter
+    def prompt_template(self, value: str):
+        """
+        Set the prompt template of the benchmark.
+        """
+        self._benchmark_meta.prompt_template = value
+
+    @property
+    def system_prompt(self) -> Optional[str]:
+        """
+        Return the system prompt of the benchmark.
+        """
+        return self._benchmark_meta.system_prompt
+
+    @property
+    def query_template(self) -> Optional[str]:
+        """
+        Return the query template of the benchmark.
+        """
+        return self._benchmark_meta.query_template
+
+    @property
+    def few_shot_prompt_template(self) -> Optional[str]:
+        """
+        Return the few-shot prompt template of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_prompt_template
+
+    @property
+    def pretty_name(self) -> Optional[str]:
+        """
+        Return the pretty name of the benchmark.
+        """
+        return self._benchmark_meta.pretty_name
+
+    @property
+    def description(self) -> Optional[str]:
+        """
+        Return the description of the benchmark.
+        """
+        return self._benchmark_meta.description
+
+    @property
+    def tags(self) -> Optional[List[str]]:
+        """
+        Return the tags of the benchmark.
+        """
+        return self._benchmark_meta.tags
+
+    @property
+    def filters(self) -> Optional[OrderedDict]:
+        """
+        Return the filters of the benchmark.
+        """
+        return self._benchmark_meta.filters
+
+    @property
+    def filter_ensemble(self) -> Optional[FilterEnsemble]:
+        """
+        Return the filter ensemble of the benchmark.
+        """
+        if self._filter_ensemble is None:
+            if self.filters:
+                self._filter_ensemble = build_filter_ensemble(filters=self.filters)
+        return self._filter_ensemble
+
+    @property
+    def aggregation(self) -> str:
+        """
+        Return the aggregation function for the metrics.
+        """
+        return self._benchmark_meta.aggregation
+
+    @property
+    def extra_params(self) -> Optional[Dict]:
+        """
+        Return the extra parameters of the benchmark.
+        """
+        return self._benchmark_meta.extra_params
+
+    @property
+    def seed(self) -> Optional[int]:
+        """
+        Return the seed for the benchmark.
+        """
+        return self._task_config.seed
+
+    @contextlib.contextmanager
+    def _temporary_attribute(self, attr_name: str, new_value):
+        """
+        Set a temporary value for an attribute and restore the original value after the context block.
+
+        Args:
+            attr_name: The name of the attribute to temporarily set.
+            new_value: The new value to set for the attribute.
+        """
+        had_attr = hasattr(self, attr_name)
+        original_value = getattr(self, attr_name, None) if had_attr else None
+
+        setattr(self, attr_name, new_value)
+        try:
+            yield
+        finally:
+            if had_attr:
+                setattr(self, attr_name, original_value)
+            else:
+                delattr(self, attr_name)
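
The _temporary_attribute helper is a plain context manager: it records whether the attribute existed, installs the new value, and on exit either restores the original value or deletes the attribute. A minimal usage sketch follows; the subclass and method are hypothetical, and it assumes DefaultDataAdapter supplies the abstract hooks such as load_dataset.

    from evalscope.api.benchmark.adapters.default_data_adapter import DefaultDataAdapter


    class SubsetAwareAdapter(DefaultDataAdapter):
        """Hypothetical adapter illustrating _temporary_attribute."""

        def load_each_subset(self):
            datasets = {}
            for subset in self.subset_list:
                # current_subset_name is swapped in only for the duration of the block
                # and restored to its previous value ('' by default) afterwards.
                with self._temporary_attribute('current_subset_name', subset):
                    datasets[subset] = self.load_dataset()
            return datasets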

evalscope/api/benchmark/meta.py (new file)
@@ -0,0 +1,115 @@
+import copy
+from collections import OrderedDict
+from dataclasses import asdict, dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union
+
+from evalscope.constants import OutputType
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+
+
+@dataclass
+class BenchmarkMeta:
+    """Metadata for a benchmark, including dataset and model configurations."""
+
+    name: str
+    """ Unique name of the benchmark."""
+
+    dataset_id: str
+    """ Dataset id on modelscope or path to local dataset."""
+
+    data_adapter: Optional[Type['DataAdapter']] = None
+    """ Data adapter class for the benchmark."""
+
+    output_types: List[str] = field(default_factory=lambda: [OutputType.GENERATION])
+    """ List of output types supported by the benchmark."""
+
+    subset_list: List[str] = field(default_factory=lambda: ['default'])
+    """ List of subsets available for the benchmark."""
+
+    default_subset: str = 'default'
+    """ Default subset to use for the benchmark."""
+
+    few_shot_num: int = 0
+    """ Number of few-shot examples to use."""
+
+    few_shot_random: bool = False
+    """ Whether to use random few-shot examples."""
+
+    train_split: Optional[str] = None
+    """ Training split to use for the benchmark."""
+
+    eval_split: Optional[str] = None
+    """ Evaluation split to use for the benchmark."""
+
+    prompt_template: Optional[str] = None
+    """ Prompt template to use for the benchmark."""
+
+    few_shot_prompt_template: Optional[str] = None
+    """ Few-shot prompt template to use for the benchmark."""
+
+    system_prompt: Optional[str] = None
+    """ System prompt to use for the benchmark."""
+
+    query_template: Optional[str] = None
+    """ Query template to use for the benchmark."""
+
+    pretty_name: Optional[str] = None
+    """ Human-readable name for the benchmark."""
+
+    description: Optional[str] = None
+    """ Description of the benchmark."""
+
+    tags: List[str] = field(default_factory=list)
+    """ Tags associated with the benchmark."""
+
+    filters: Optional[OrderedDict] = None
+    """ Filters to apply to the dataset on model output."""
+
+    metric_list: List[Union[str, Dict[str, Any]]] = field(default_factory=list)
+    """ List of metrics to evaluate the benchmark."""
+
+    aggregation: str = 'mean'
+    """ Aggregation function for the metrics. Default is 'mean'. Can be 'mean', 'pass@<k>' or a custom function name."""
+
+    extra_params: Dict = field(default_factory=dict)
+    """ Additional parameters for the benchmark."""
+
+    def __post_init__(self):
+        """Validate fields after initialization."""
+        if self.few_shot_num < 0:
+            raise ValueError('few_shot_num must be >= 0')
+
+    def _update(self, args: dict):
+        """Update instance with provided arguments, maintaining backward compatibility."""
+        args = copy.deepcopy(args)
+
+        if args.get('local_path'):
+            self.dataset_id = args['local_path']
+            del args['local_path']
+
+        if args.get('filters'):
+            if self.filters is None:
+                self.filters = OrderedDict()
+            new_filters = OrderedDict(args['filters'])
+            # insert filters at the beginning
+            self.filters = OrderedDict(list(new_filters.items()) + list(self.filters.items()))
+            del args['filters']
+        # Update fields with validation
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)  # Validate few_shot_num if it's being updated
+                if key == 'few_shot_num' and value < 0:
+                    raise ValueError('few_shot_num must be >= 0')
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary, maintaining backward compatibility."""
+        return asdict(self)
+
+    def to_string_dict(self) -> dict:
+        """Convert to string dictionary, excluding data_adapter."""
+        cur_dict = copy.deepcopy(asdict(self))
+        if 'data_adapter' in cur_dict:
+            del cur_dict['data_adapter']
+        return cur_dict
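
As a rough sketch of how this dataclass is meant to be used, the snippet below builds a BenchmarkMeta and then applies runtime overrides through _update(). The benchmark name, dataset id, metric, and filter entries are illustrative placeholders, and the import path assumes BenchmarkMeta is re-exported from evalscope.api.benchmark.

    from collections import OrderedDict

    from evalscope.api.benchmark import BenchmarkMeta  # assumed re-export of meta.BenchmarkMeta

    # Placeholder names throughout; only the _update() behaviour mirrors the code above.
    meta = BenchmarkMeta(
        name='my_mcq_bench',
        dataset_id='my-org/my-mcq-dataset',
        subset_list=['default'],
        few_shot_num=5,
        metric_list=['acc'],
    )

    meta._update({
        'local_path': '/data/my-mcq-dataset',                     # legacy key, copied onto dataset_id
        'filters': OrderedDict([('remove_until', '</think>')]),   # prepended to existing filters
        'few_shot_num': 0,                                        # plain field override with validation
    })

    assert meta.dataset_id == '/data/my-mcq-dataset'
    assert meta.few_shot_num == 0
    assert list(meta.filters) == ['remove_until']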

evalscope/api/dataset/__init__.py (new file)
@@ -0,0 +1,2 @@
+from .dataset import Dataset, DatasetDict, MemoryDataset, Sample
+from .loader import DataLoader, DictDataLoader, LocalDataLoader, RemoteDataLoader