evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/utils/llm.py
@@ -2,11 +2,10 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
-from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models import ChatGenerationModelAdapter, LocalModel
+from evalscope.api.model import GenerateConfig, Model, get_model
+from evalscope.constants import DEFAULT_MODEL_REVISION, EvalType
 
 
 class LLM:
@@ -30,16 +29,19 @@ class LocalLLM(BaseLLM):
     model_name_or_path: str
     model_revision: str = DEFAULT_MODEL_REVISION
     template_type: Optional[str] = None
-    model_name: Optional[str]
-    model: Optional[ChatGenerationModelAdapter]
-    generation_config: Optional[Dict]
+    model_name: Optional[str] = None
+    model: Optional[Model] = None
+    generation_config: Optional[Dict] = {}
 
     def __init__(self, **kw):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
-        self.model = ChatGenerationModelAdapter(
-            model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
-            generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+
+        # Create and initialize the local model
+        self.model = get_model(
+            model=self.model_name_or_path,
+            eval_type=EvalType.CHECKPOINT,
+            config=GenerateConfig(**self.generation_config),
         )
 
     def _call(
@@ -50,10 +52,9 @@ class LocalLLM(BaseLLM):
         **kwargs: Any,
     ) -> str:
         """Run the LLM on the given input."""
-        infer_cfg = {'stop': stop}
 
-        response, _ = self.model.predict([{'data': [prompt]}], infer_cfg=infer_cfg)
-        return response[0][0]
+        response = self.model.generate(input=prompt)
+        return response.completion
 
     @property
     def _identifying_params(self) -> Dict[str, Any]:
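Note: the hunks above show the 1.0.0 migration from the removed `ChatGenerationModelAdapter`/`LocalModel` pair to the `evalscope.api.model` factory. A minimal sketch of the new call pattern, based only on the calls visible in this diff (the model path is a placeholder, and the empty `GenerateConfig()` stands in for whatever generation parameters you actually pass):

    from evalscope.api.model import GenerateConfig, get_model
    from evalscope.constants import EvalType

    # Placeholder checkpoint path; substitute a real local model directory.
    model = get_model(
        model='/path/to/local/model',
        eval_type=EvalType.CHECKPOINT,
        config=GenerateConfig(),
    )

    # generate() and .completion are used exactly as in the _call() hunk above.
    response = model.generate(input='Hello, who are you?')
    print(response.completion)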
evalscope/benchmarks/__init__.py
@@ -4,8 +4,6 @@ import importlib
 import os
 import time
 
-from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
-from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.utils import get_logger
 logger = get_logger()
 
evalscope/benchmarks/aigc/i2i/__init__.py (file without changes)
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py (new file)
@@ -0,0 +1,44 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class GeneralI2IAdapter:
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
+
+        data_file_dict = defaultdict(str)
+        data_item_dict = defaultdict(list)
+
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
+        try:
+            for subset_name, file_path in data_file_dict.items():
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
+        except Exception as e:
+            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
+
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
+
+        return data_dict
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py
@@ -1,78 +1,76 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
 from collections import defaultdict
 from typing import List, Optional, Union
 
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.metric.scorer import AggScore, Score
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
 from evalscope.metrics import mean
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.utils.function_utils import thread_safe
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='evalmuse',
-    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['EvalMuse'],
-    metric_list=['FGA_BLIP2Score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='evalmuse',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='EvalMuse Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['EvalMuse'],
+        metric_list=['FGA_BLIP2Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class EvalMuseAdapter(T2IBaseAdapter):
+class EvalMuseAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        assert len(self.metric_list
+                   ) == 1 and self.metric_list[0] == 'FGA_BLIP2Score', 'Only FGA_BLIP2Score is supported for EvalMuse'
 
-    def load(self, **kwargs) -> dict:
-        if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)
+    @thread_safe
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state):
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get('image_path', original_prediction)
 
-    def get_gold_answer(self, input_d: dict) -> dict:
-        # return prompt and elements dict
-        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
 
-    def match(self, gold: dict, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            if metric_name == 'FGA_BLIP2Score':
-                # For FGA_BLIP2Score, we need to pass the dictionary
-                score = metric_func(images=[pred], texts=[gold])[0][0]
-            else:
-                score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-            if isinstance(score, dict):
-                for k, v in score.items():
-                    res[f'{metric_name}:{k}'] = v.cpu().item()
-            else:
-                res[metric_name] = score.cpu().item()
-        return res
+        # Calculate scores for each configured metric
+        try:
+            metric_name = self.metric_list[0]
+            metric_cls = get_metric(metric_name)
+            metric_func = metric_cls()  # Initialize with parameters
+            metric_score = metric_func(image_path, task_state.metadata)[0]
 
-    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-        """
-        items = super().compute_dict_metric(review_res_list, **kwargs)
-        # add statistics for each metric
+            for k, v in metric_score.items():
+                score.value[f'{metric_name}:{k}'] = v.cpu().item()
+        except Exception as e:
+            logger.error(f'Error calculating metric {metric_name}: {e}')
+            score.value[metric_name] = 0
+            score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def aggregate_scores(self, sample_scores) -> List[AggScore]:
         new_items = defaultdict(list)
-        for metric_name, value_list in items.items():
-            if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
+        agg_list = []
+        for sample_score in sample_scores:
+            for metric_name, value in sample_score.score.value.items():
                 metrics_prefix = metric_name.split(':')[0]
                 category = metric_name.rpartition('(')[-1].split(')')[0]
                 category = category.split('-')[0].lower()  # remove the suffix if exists
-                new_items[f'{metrics_prefix}:{category}'].extend(value_list)
-            else:
-                new_items[metric_name].extend(value_list)
+                new_items[f'{metrics_prefix}:{category}'].append(value)
+
+        for k, v in new_items.items():
+            agg_list.append(AggScore(metric_name=k, score=mean(v), num=len(v)))
 
-        # calculate mean for each metric
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
+        return agg_list
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py
@@ -1,58 +1,53 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.utils.io_utils import jsonl_to_list
+import os
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='genai_bench',
-    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['GenAI-Bench-1600'],
-    metric_list=['VQAScore'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='genai_bench',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='GenAI-Bench Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['GenAI-Bench-1600'],
+        metric_list=['VQAScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class GenAIBenchAdapter(T2IBaseAdapter):
+class GenAIBenchAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load_from_disk(self, **kwargs):
         if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)
-
-    def get_gold_answer(self, input_d: dict) -> dict:
-        # return prompt and elements dict
-        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-    def match(self, gold: dict, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-            res[metric_name] = score.cpu().item()
-
-            # fine-granular metrics
-            if gold['tags'].get('advanced'):
-                res[f'{metric_name}_advanced'] = score.cpu().item()
-            else:
-                res[f'{metric_name}_basic'] = score.cpu().item()
-
-        return res
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        advanced = record['tags'].get('advanced')
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': 'advanced' if advanced else 'basic',
+                'tags': record.get('tags', []),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py
@@ -1,58 +1,42 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
+import os
 
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='general_t2i',
-    dataset_id='general_t2i',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['default'],
-    metric_list=['PickScore'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_t2i',
+        dataset_id='general_t2i',
+        description='General Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['default'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class GeneralT2IAdapter(T2IBaseAdapter):
+class GeneralT2IAdapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
 
        super().__init__(**kwargs)
 
-    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict
+    def load_from_disk(self, **kwargs):
+        if os.path.isfile(self.dataset_id):
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(input=[ChatMessageUser(content=record['prompt'])], metadata={'image_path': record['image_path']})
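As the adapters above illustrate, 1.0.0 replaces `@Benchmark.register(...)` on a `T2IBaseAdapter` subclass with `@register_benchmark(BenchmarkMeta(...))` on a `Text2ImageAdapter` subclass plus a `record_to_sample` hook. A minimal sketch following the same pattern (the benchmark name, dataset path, and record fields here are illustrative, not taken from this diff):

    from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
    from evalscope.api.dataset import Sample
    from evalscope.api.messages import ChatMessageUser
    from evalscope.api.registry import register_benchmark
    from evalscope.constants import Tags


    @register_benchmark(
        BenchmarkMeta(
            name='my_t2i',                       # hypothetical benchmark name
            dataset_id='data/my_prompts.jsonl',  # hypothetical local JSONL file
            description='Example Text-to-Image Benchmark',
            tags=[Tags.TEXT_TO_IMAGE],
            subset_list=['default'],
            metric_list=['PickScore'],
            few_shot_num=0,
            train_split=None,
            eval_split='test',
        )
    )
    class MyT2IAdapter(Text2ImageAdapter):

        def record_to_sample(self, record):
            # Assumes each JSONL record carries 'prompt' and 'image_path' fields,
            # mirroring the general_t2i adapter above.
            return Sample(
                input=[ChatMessageUser(content=record['prompt'])],
                metadata={'image_path': record.get('image_path', '')},
            )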
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py
@@ -1,57 +1,47 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
+import os
 
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='hpdv2',
-    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['HPDv2'],
-    metric_list=['HPSv2.1Score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='hpdv2',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='HPDv2 Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['HPDv2'],
+        metric_list=['HPSv2.1Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class HPDv2Adapter(T2IBaseAdapter):
+class HPDv2Adapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load_from_disk(self, **kwargs):
         if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)
-
-    def get_gold_answer(self, input_d: dict) -> dict:
-        # return prompt and elements dict
-        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-    def match(self, gold: dict, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-            res[metric_name] = score.cpu().item()
-
-            # fine-granular metrics
-            category = gold['tags'].get('category')
-            if category:
-                res[f'{metric_name}_{category}'] = score.cpu().item()
-
-        return res
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'category': record.get('tags', {}).get('category', ''),
+                'tags': record.get('tags', {})
+            }
+        )
evalscope/benchmarks/aigc/t2i/tifa_adapter.py
@@ -1,37 +1,26 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='tifa160',
-    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['TIFA-160'],
-    metric_list=['PickScore'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='tifa160',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='TIFA-160 Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['TIFA-160'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
 )
-class TIFA_Adapter(T2IBaseAdapter):
+class TIFA_Adapter(Text2ImageAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-
-    def load(self, **kwargs) -> dict:
-        if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)