evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (302) hide show
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -0,0 +1 @@
1
+ from .filter import Filter, FilterEnsemble, build_filter_ensemble
@@ -0,0 +1,72 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any, Callable, Dict, Iterable, List, Union
4
+
5
+ from evalscope.api.registry import get_filter
6
+
7
+
8
+ class Filter(ABC):
9
+ """
10
+ Filter classes operate on a sample level.
11
+ """
12
+
13
+ def __init__(self, *args, **kwargs) -> None:
14
+ """
15
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
16
+ """
17
+
18
+ @abstractmethod
19
+ def apply(self, instance: List[str]) -> List[str]:
20
+
21
+ return instance
22
+
23
+ def __call__(self, instance: str) -> str:
24
+ """
25
+ Allows the filter to be called like a function.
26
+ """
27
+ return self.apply([instance])[0]
28
+
29
+
30
@dataclass
class FilterEnsemble:
    """
    FilterEnsemble creates a pipeline applying multiple filters.
    Its intended usage is to stack multiple post-processing steps in order.
    """

    # Human-readable identifier for this pipeline.
    name: str
    # Filter instances applied in order by `apply`. (`build_filter_ensemble`
    # appends constructed Filter objects, and `apply` calls `f.apply` on each.)
    filters: List[Filter]

    def apply(self, instance: List[str]) -> List[str]:
        """Run each filter over the batch in sequence and return the result."""
        for f in self.filters:
            # apply filters in sequence
            instance = f.apply(instance)

        return instance

    def __call__(self, instance: str) -> str:
        """
        Allows the filter ensemble to be called like a function.
        """
        return self.apply([instance])[0]
53
+
54
+
55
def build_filter_ensemble(name: str = 'default',
                          filters: Union[Dict[str, Any], None] = None) -> FilterEnsemble:
    """
    Create a filtering pipeline.

    Args:
        name: Identifier for the resulting ensemble.
        filters: Mapping of registered filter name -> constructor arguments.
            A list is splatted as positional args, a dict as keyword args,
            and any other value is passed as a single positional arg.
            ``None`` (the default) yields an ensemble with no steps.

    Returns:
        A FilterEnsemble applying the constructed filters in insertion order.
    """
    # NOTE: default is None rather than a mutable {} literal, which would be
    # shared across calls (classic Python mutable-default pitfall).
    filter_funcs = []
    for filter_name, filter_args in (filters or {}).items():
        filter_cls = get_filter(filter_name)
        if isinstance(filter_args, list):
            filter_function = filter_cls(*filter_args)
        elif isinstance(filter_args, dict):
            filter_function = filter_cls(**filter_args)
        else:
            # Assume single value for simple filters
            filter_function = filter_cls(filter_args)
        # add the filter as a pipeline step
        filter_funcs.append(filter_function)

    return FilterEnsemble(name=name, filters=filter_funcs)
@@ -0,0 +1,12 @@
1
+ from .chat_message import (
2
+ ChatMessage,
3
+ ChatMessageAssistant,
4
+ ChatMessageSystem,
5
+ ChatMessageTool,
6
+ ChatMessageUser,
7
+ dict_to_chat_message,
8
+ messages_pretty_str,
9
+ messages_to_markdown,
10
+ )
11
+ from .content import Content, ContentAudio, ContentData, ContentImage, ContentReasoning, ContentText, ContentVideo
12
+ from .utils import parse_content_with_reasoning
@@ -0,0 +1,243 @@
1
+ import uuid
2
+ from pydantic import BaseModel, Field, JsonValue, model_validator
3
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
4
+
5
+ from evalscope.api.tool import ToolCall, ToolCallError
6
+ from .content import Content, ContentImage, ContentReasoning, ContentText
7
+ from .utils import parse_content_with_reasoning
8
+
9
+
10
class ChatMessageBase(BaseModel):
    """Shared fields and text helpers for all chat message roles."""

    id: Optional[str] = Field(default=None)
    """Unique identifier for message."""

    content: Union[str, List[Content]]
    """Content (simple string or list of content objects)"""

    source: Optional[Literal['input', 'generate']] = Field(default=None)
    """Source of message."""

    metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Additional message metadata."""

    internal: Optional[JsonValue] = Field(default=None)
    """Model provider specific payload - typically used to aid transformation back to model types."""

    def model_post_init(self, __context: Any) -> None:
        # Assign a short random identifier when none was supplied
        # (8 hex chars is enough to disambiguate within a conversation).
        if self.id is None:
            self.id = uuid.uuid4().hex[:8]

    @property
    def text(self) -> str:
        """Get the text content of this message.

        Message content is either a plain string or a list of content
        parts (text, images, reasoning, ...). Callers frequently want to
        treat a message as a simple string, so this property returns the
        string content directly, or — for list content — all text parts
        joined with newlines.
        """
        if isinstance(self.content, str):
            return self.content
        return '\n'.join(part.text for part in self.content if part.type == 'text')

    @text.setter
    def text(self, text: str) -> None:
        """Set the primary text content for this message.

        If content is a plain string it is replaced outright. If content
        is a list, every existing text part is removed and a single
        ContentText carrying *text* is appended after the remaining
        non-text parts (images etc. are preserved).
        """
        if isinstance(self.content, str):
            self.content = text
        else:
            non_text = [part for part in self.content if part.type != 'text']
            self.content = non_text + [ContentText(text=text)]
72
+
73
+
74
class ChatMessageSystem(ChatMessageBase):
    """Chat message carrying system instructions."""

    # Conversation role discriminator: always 'system' for this type.
    role: Literal['system'] = Field(default='system')
79
+
80
+
81
class ChatMessageUser(ChatMessageBase):
    """Chat message authored by the end user."""

    # Conversation role discriminator: always 'user' for this type.
    role: Literal['user'] = Field(default='user')

    # ID(s) of tool call(s) this message has the content payload for.
    tool_call_id: Optional[List[str]] = Field(default=None)
89
+
90
+
91
class ChatMessageAssistant(ChatMessageBase):
    """Assistant (model-generated) chat message."""

    # Conversation role discriminator: always 'assistant' for this type.
    role: Literal['assistant'] = Field(default='assistant')

    # Tool calls made by the model.
    tool_calls: Optional[List[ToolCall]] = Field(default=None)

    # Model used to generate assistant message.
    model: Optional[str] = Field(default=None)

    # Some OpenAI-compatible REST endpoints have no separate reasoning field,
    # so hosting providers (seen with Together and Groq) prepend the reasoning
    # to the content inside a <think></think> tag. This validator extracts such
    # reasoning automatically so that EvalScope always exposes reasoning
    # content separately from the answer. If this turns out to be an overreach
    # we can fall back to per-provider parsing with a helper function; the
    # exact implementation is less important than the separation guarantee.
    @model_validator(mode='before')
    @classmethod
    def extract_reasoning(cls, data: Any) -> Any:
        if not isinstance(data, dict):
            return data

        # Split a leading <think>...</think> block out of string content.
        raw = data.get('content', None)
        if isinstance(raw, str):
            body_text, think_part = parse_content_with_reasoning(raw)
            if think_part:
                data['content'] = [
                    think_part,
                    ContentText(text=body_text),
                ]

        # Migrate messages with an explicit 'reasoning' field (our original
        # representation of reasoning) into a ContentReasoning part.
        legacy = data.get('reasoning', None)
        if isinstance(legacy, str):
            # Normalize content to a list before prepending.
            raw = data.get('content', None)
            if raw is None:
                data['content'] = []
            elif isinstance(raw, str):
                data['content'] = [ContentText(text=raw)]
            elif not isinstance(raw, list):
                data['content'] = []
            data['content'].insert(0, ContentReasoning(reasoning=legacy))

            del data['reasoning']
        return data
142
+
143
+
144
class ChatMessageTool(ChatMessageBase):
    """Chat message reporting the result of a tool call."""

    # Conversation role discriminator: always 'tool' for this type.
    role: Literal['tool'] = Field(default='tool')

    # ID of the tool call this message responds to.
    tool_call_id: Optional[str] = Field(default=None)

    # Name of function called.
    function: Optional[str] = Field(default=None)

    # Error which occurred during the tool call, if any.
    error: Optional[ToolCallError] = Field(default=None)
158
+
159
+
160
ChatMessage = Union[ChatMessageSystem, ChatMessageUser, ChatMessageAssistant, ChatMessageTool]
"""Message in a chat conversation"""


def dict_to_chat_message(data: Dict[str, Any]) -> ChatMessage:
    """Convert a dictionary to a ChatMessage.

    Args:
        data: Mapping with at least a 'role' key. An already-constructed
            ChatMessage instance is returned unchanged.

    Raises:
        ValueError: If 'role' is missing or not one of the known roles.
    """
    if isinstance(data, ChatMessage):
        return data

    if 'role' not in data:
        raise ValueError('ChatMessage must have a "role" field')

    # Dispatch on role to the matching pydantic model.
    message_types = {
        'system': ChatMessageSystem,
        'user': ChatMessageUser,
        'assistant': ChatMessageAssistant,
        'tool': ChatMessageTool,
    }
    role = data['role']
    if role not in message_types:
        raise ValueError(f'Unknown chat message role: {role}')
    return message_types[role].model_validate(data)
184
+
185
+
186
def messages_pretty_str(messages: List[ChatMessage]) -> str:
    """Pretty print a list of chat messages. Without images or other multi-modal contents."""
    rendered = []
    for msg in messages:
        body = msg.text
        # Tool messages additionally surface their error and function name.
        if isinstance(msg, ChatMessageTool):
            if msg.error:
                body += f'\nError: {msg.error.message}'
            if msg.function:
                body += f'\nFunction: {msg.function}'
        rendered.append(f'**{msg.role.capitalize()}**: {body}')
    return '\n\n'.join(rendered)
199
+
200
+
201
def messages_to_markdown(messages: List[ChatMessage], max_length: Optional[int] = None) -> str:
    """Convert a list of chat messages to markdown format.

    Args:
        messages (List[ChatMessage]): The list of chat messages to convert.
        max_length (Optional[int]): If provided, truncates the base64 string of images to this length.

    Returns:
        Markdown text with one section per message, sections separated by blank lines.
    """
    sections = []
    for msg in messages:
        # Each section starts with a bold role header.
        parts = [f'**{msg.role.capitalize()}**: ']

        if isinstance(msg.content, str):
            parts.append(msg.content)
        else:
            for item in msg.content:
                if isinstance(item, ContentText):
                    parts.append(item.text)
                elif isinstance(item, ContentImage):
                    # Render with markdown image syntax, optionally truncating
                    # the (possibly very long) base64 payload.
                    image_data = item.image
                    if max_length and len(image_data) > max_length:
                        image_data = image_data[:max_length]
                    parts.append(f'![image]({image_data})')
                elif isinstance(item, ContentReasoning):
                    parts.append(f'**Reasoning:** {item.reasoning}')

        # Append tool- or assistant-specific details.
        if isinstance(msg, ChatMessageTool):
            if msg.error:
                parts.append(f'**Error:** {msg.error.message}')
            if msg.function:
                parts.append(f'**Function:** {msg.function}')
        elif isinstance(msg, ChatMessageAssistant) and msg.tool_calls:
            parts.extend(f'**Tool Call:** {tc.function}' for tc in msg.tool_calls)

        sections.append('\n'.join(parts))

    return '\n\n'.join(sections)
@@ -0,0 +1,102 @@
1
+ from pydantic import BaseModel, Field, JsonValue
2
+ from typing import Dict, Literal, Optional, Sequence, Union
3
+
4
+
5
class ContentBase(BaseModel):
    """Common base for all content variants; carries an opaque provider payload."""

    internal: Optional[JsonValue] = Field(default=None)
    """Model provider specific payload - typically used to aid transformation back to model types."""
8
+
9
+
10
class ContentText(ContentBase):
    """Text content."""

    type: Literal['text'] = Field(default='text')
    """Type discriminator (always 'text')."""

    text: str
    """Text content."""

    refusal: Optional[bool] = Field(default=None)
    """Was this a refusal message? None when the provider did not say."""
21
+
22
+
23
class ContentReasoning(ContentBase):
    """Reasoning content (a model's chain-of-thought / thinking block).

    See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
    """  # noqa: E501

    type: Literal['reasoning'] = Field(default='reasoning')
    """Type discriminator (always 'reasoning')."""

    reasoning: str
    """Reasoning content."""

    signature: Optional[str] = Field(default=None)
    """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""  # noqa: E501

    redacted: bool = Field(default=False)
    """Indicates that the explicit content of this reasoning block has been redacted."""
40
+
41
+
42
class ContentImage(ContentBase):
    """Image content."""

    type: Literal['image'] = Field(default='image')
    """Type discriminator (always 'image')."""

    image: str
    """Either a URL of the image or the base64 encoded image data."""

    detail: Literal['auto', 'low', 'high'] = Field(default='auto')
    """Specifies the detail level of the image (defaults to 'auto').

    Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
    """  # noqa: E501
56
+
57
+
58
class ContentAudio(ContentBase):
    """Audio content."""

    type: Literal['audio'] = Field(default='audio')
    """Type discriminator (always 'audio')."""

    audio: str
    """Audio file path or base64 encoded data URL."""

    format: Literal['wav', 'mp3']
    """Format of audio data ('wav' or 'mp3')."""
69
+
70
+
71
class ContentVideo(ContentBase):
    """Video content."""

    type: Literal['video'] = Field(default='video')
    """Type discriminator (always 'video')."""

    video: str
    """Video file path or base64 encoded data URL."""

    format: Literal['mp4', 'mpeg', 'mov']
    """Format of video data ('mp4', 'mpeg', or 'mov')"""
82
+
83
+
84
class ContentData(ContentBase):
    """Model internal data content (provider-specific, not user-facing)."""

    type: Literal['data'] = Field(default='data')
    """Type discriminator (always 'data')."""

    data: Dict[str, JsonValue]
    """Model provider specific payload - required for internal content."""
92
+
93
+
94
# Variants are distinguishable by their `type` literal field.
Content = Union[
    ContentText,
    ContentReasoning,
    ContentImage,
    ContentAudio,
    ContentVideo,
    ContentData,
]
"""Content sent to or received from a model."""
@@ -0,0 +1,35 @@
1
+ import re
2
+ from typing import Optional
3
+
4
+ from .content import ContentReasoning
5
+
6
+
7
def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
    """
    Looks for and extracts <think/> tags into reasoning text.

    Returns a tuple:
    - The first element is the input content with the <think> tag and its contents fully removed.
    - The second element is a ContentReasoning object (or None if no <think> tag is found).
    """
    # The tag may carry an optional signature attribute followed by an
    # optional redacted="true" attribute, in that order.
    think_re = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
    found = re.search(think_re, content, re.DOTALL)
    if found is None:
        return content, None

    signature = found.group(1)       # None when the attribute is absent
    redacted_flag = found.group(2)   # "true" or None
    body = found.group(3)

    # Splice the whole <think>...</think> span out of the original text.
    begin, finish = found.span()
    remaining = (content[:begin] + content[finish:]).strip()

    reasoning_obj = ContentReasoning(
        reasoning=body.strip(),
        signature=signature,
        redacted=redacted_flag == 'true',
    )
    return remaining, reasoning_obj
@@ -0,0 +1,2 @@
1
+ from .metric import Metric, T2IMetric
2
+ from .scorer import Aggregator, AggScore, SampleScore, Score, Value
@@ -0,0 +1,55 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Callable, Iterable, List, Union
3
+
4
+ from evalscope.utils import get_logger
5
+ from evalscope.utils.function_utils import thread_safe
6
+
7
# Module-level logger shared by the metric classes below.
logger = get_logger()
8
+
9
+
10
class Metric(ABC):
    """Base class for sample-level metrics.

    Subclasses implement :meth:`apply`, which scores batches of
    prediction/reference pairs; a single pair can be scored by calling
    the metric instance directly.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Subclasses may override to hold per-instance state; the base does nothing."""

    @abstractmethod
    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
        """Score each prediction against its reference; one float per pair."""
        pass

    def __call__(self, prediction: str, reference: str) -> float:
        """Score a single prediction/reference pair by delegating to `apply`."""
        batch_scores = self.apply([prediction], [reference])
        return batch_scores[0]
29
+
30
+
31
class T2IMetric(Metric):
    """Text-to-image metric base with singleton construction.

    Repeated instantiation returns the same object, and the (potentially
    expensive) setup in `_init_once` runs only on the first construction.
    """
    # Lazily-created singleton; the assignment in __new__ stores it on the
    # concrete class being instantiated.
    _instance = None

    @thread_safe
    def __new__(cls, *args, **kwargs):
        # Guarded by @thread_safe so concurrent first constructions cannot
        # create two instances.
        # NOTE(review): the `cls._instance` lookup falls back through the MRO,
        # so instantiating a parent class first would make later subclasses
        # reuse the parent's instance - confirm subclasses are constructed
        # directly and the base is never instantiated itself.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, *args, **kwargs):
        cls = self.__class__
        # __init__ re-runs on every construction of the singleton; the
        # _init_done flag makes the real initialization happen only once.
        if hasattr(self, '_init_done'):
            return
        logger.info(f'Initializing {cls.__name__}...')
        self._init_once(*args, **kwargs)
        self._init_done = True

    def _init_once(self, *args, **kwargs):
        # Subclass hook for one-time setup (e.g. loading model weights).
        pass

    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
        # Subclasses score each image against its paired text; per the
        # annotation, each result may be a float or a dict of sub-scores.
        pass

    def __call__(self, image: str, text: str, **kwargs) -> Union[float, dict]:
        # Convenience wrapper: score a single image/text pair via apply().
        return self.apply([image], [text], **kwargs)[0]
@@ -0,0 +1,113 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
3
+
4
+ from evalscope.utils.logger import get_logger
5
+
6
# Module-level logger for scorer diagnostics.
logger = get_logger()
7
+
8
# A score value: mapping from score name to its numeric/boolean value.
Value = Dict[str, Union[int, float, bool]]
9
+
10
+
11
class Score(BaseModel):
    """Score generated by a scorer."""

    value: Value = Field(default_factory=dict)
    """Score value as a dictionary. Key is the score name, value is the score value.
    The first key is considered the main score by default."""

    extracted_prediction: Optional[str] = Field(default=None)
    """Answer extracted from model output (optional)"""

    prediction: Optional[str] = Field(default=None)
    """Original prediction text from the model (optional)"""

    explanation: Optional[str] = Field(default=None)
    """Explanation of score (optional)."""

    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    """Additional metadata related to the score"""

    main_score_name: Optional[str] = Field(default=None)
    """Main score name, if applicable. This is used to indicate which score is the primary score in a multi-score scenario."""  # noqa: E501

    @property
    def main_value(self) -> Union[int, float, bool]:
        """Main score value.

        Falls back to the first entry of ``value`` when ``main_score_name``
        is unset or absent from ``value``; note that this fallback records
        the chosen key back into ``main_score_name`` (the getter mutates
        state). Returns None when ``value`` is empty.
        """
        if self.main_score_name and self.main_score_name in self.value:
            return self.value[self.main_score_name]
        elif self.value:
            # If main_score_name is not set or not found, use the first value and update main_score_name
            first_key = next(iter(self.value))
            self.main_score_name = first_key
            return self.value[first_key]
        return None

    @main_value.setter
    def main_value(self, value: Union[int, float, bool]):
        """Set the main score value.

        Writes under ``main_score_name`` when known, otherwise under the
        first existing key; creates a 'default' entry when the score dict
        is empty.
        """
        if self.main_score_name:
            # If main_score_name is already set, use it
            self.value[self.main_score_name] = value
        elif self.value:
            # If no main_score_name but value dict exists, use the first key
            first_key = next(iter(self.value))
            self.main_score_name = first_key
            self.value[first_key] = value
        else:
            # If neither main_score_name nor value dict exists, initialize both
            self.main_score_name = 'default'
            self.value[self.main_score_name] = value
60
+
61
+
62
class SampleScore(BaseModel):
    """Score for a single evaluated Sample."""

    score: Score
    """A score"""

    sample_id: Optional[Union[str, int]] = Field(default=None)
    """A sample id"""

    group_id: Optional[Union[str, int]] = Field(default=None)
    """A group id for the sample, used for grouping k repeated samples."""

    sample_metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Metadata from the sample"""
76
+
77
+
78
class AggScore(BaseModel):
    """Output of an aggregation operation over sample scores."""

    score: float = Field(default=0.0)
    """Aggregated value as a float."""

    metric_name: str = Field(default='')
    """Name of the metric being aggregated."""

    aggregation_name: str = Field(default='')
    """Name of the aggregation method (e.g. mean)."""

    num: int = Field(default=0)
    """Number of samples used in the aggregation."""

    ids: Optional[List[Union[str, int]]] = Field(default=None)
    """List of sample IDs used in the aggregation, if applicable."""

    metadata: Optional[Dict[str, Any]] = Field(default=None)
    """Additional metadata related to the aggregation."""
98
+
99
+
100
class Aggregator:
    """Callable interface that reduces per-sample scores into aggregates."""

    # Identifier for this aggregation strategy.
    name = 'default'

    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
        r"""Aggregate a metric on a list of scores.

        Args:
            scores: List of scores.

        Returns:
            List[AggScore]: List of aggregated outputs.
        """
        ...
@@ -0,0 +1 @@
1
+ from .llm_judge_mixin import LLMJudgeMixin