evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/filters/__init__.py
@@ -0,0 +1,2 @@
+from .extraction import *
+from .selection import *
evalscope/filters/extraction.py
@@ -0,0 +1,126 @@
+import re
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('regex')
+class RegexFilter(Filter):
+    """A filter that extracts values from text using regex pattern matching.
+
+    This filter applies a regex pattern to each model response and extracts matched values.
+    If no match is found, returns a fallback value. Useful for extracting structured data
+    (like numbers) from unstructured model outputs.
+    """
+
+    def __init__(
+        self,
+        regex_pattern: str = r'#### (\-?[0-9\.\,]+)',
+        group_select: int = 0,
+        fallback: str = '[invalid]',
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Apply regex pattern to each string in the instance list."""
+        filtered = []
+        for resp in instance:
+            match = self.regex.findall(resp)
+            if match:
+                match = match[self.group_select]
+                if isinstance(match, tuple):
+                    match = [m for m in match if m]
+                    if match:
+                        match = match[0]
+                    else:
+                        match = self.fallback
+                match = match.strip()
+            else:
+                match = self.fallback
+            filtered.append(match)
+        return filtered
+
+
+@register_filter('regex_pos')
+class POSFilter(Filter):
+    """ """
+
+    def __init__(
+        self,
+        regex_pattern: str = r"\['(.*?)'\]",
+        group_select=0,
+        fallback=None,
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        if fallback is None:
+            fallback = ['invalid']
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Extract POS tags from each string in the instance list."""
+
+        def extract_tagged_tokens(text):
+            # Extract tagged tokens list from text input using regex
+            tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+            return [(token, pos) for token, pos in tokens]
+
+        def extract_pos_tags(result):
+            pos_tags = []
+            if isinstance(result, str):
+                result = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result)
+            return pos_tags if pos_tags else self.fallback
+
+        filtered = []
+        for resp in instance:
+            match = extract_pos_tags(resp)
+            filtered.append(str(match))
+        return filtered
+
+
+@register_filter('remove_whitespace')
+class WhitespaceFilter(Filter):
+    """Filters out leading whitespace from responses."""
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove leading whitespace from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.lstrip()
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('remove_until')
+class RemoveUntilFilter(Filter):
+    """Filters out all text until a specified delimiter is found."""
+
+    def __init__(self, delimiter: str) -> None:
+        self.delimiter = delimiter
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove all text until the delimiter from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.split(self.delimiter, 1)[-1]
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('extract')
+class ExtractFilter(RegexFilter):
+    ...
evalscope/filters/selection.py
@@ -0,0 +1,57 @@
+from collections import Counter
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('take_first')
+class TakeFirstFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take only the first response from the instance list.
+        """
+        return [instance[0]] if instance else []
+
+
+@register_filter('take_first_k')
+class TakeKFilter(Filter):
+
+    def __init__(self, **kwargs) -> None:
+        self.k = kwargs.pop('k')
+        super().__init__(**kwargs)
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take the first k responses from the instance list.
+        """
+        assert len(instance) >= self.k, (
+            f'Need at least {self.k} responses to take first {self.k}, but got {len(instance)} only!'
+        )
+        return instance[:self.k]
+
+
+@register_filter('majority_vote')
+class MajorityVoteFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Select the response that occurs most frequently in the instance list.
+        """
+        if not instance:
+            return []
+
+        counts = Counter(instance)
+        vote = counts.most_common(1)[0][0]
+        return [vote]
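
A minimal usage sketch (not part of the diff) of the new filter classes above, assuming the filters import as defined; the sample responses and the commented outputs are illustrative only:

from evalscope.filters.extraction import RegexFilter
from evalscope.filters.selection import MajorityVoteFilter

# Raw model responses; the default pattern extracts '#### <answer>' lines.
responses = [
    'Step 1: 2 + 2 = 4\n#### 4',
    'So the final answer is\n#### 4',
    'I am not sure about this one.',
]

extractor = RegexFilter()             # falls back to '[invalid]' when nothing matches
answers = extractor.apply(responses)  # expected: ['4', '4', '[invalid]']

voter = MajorityVoteFilter()
print(voter.apply(answers))           # expected: ['4']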
evalscope/metrics/__init__.py
@@ -4,12 +4,18 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
-    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
-    from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
-                          weighted_mean)
-    from .named_metrics import Metric, metric_registry
+    from .metric import PassAtK
+    from .metrics import (
+        bleu_ngram_one_sample,
+        exact_match,
+        macro_mean,
+        mean,
+        micro_mean,
+        simple_f1_score,
+        weighted_mean,
+    )
     from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

 else:
@@ -23,9 +29,8 @@ else:
             'simple_f1_score',
             'weighted_mean',
         ],
-        'named_metrics': [
-            'Metric',
-            'metric_registry',
+        'metric': [
+            'PassAtK',
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
@@ -41,12 +46,7 @@
             'extract_answer',
             'math_equal',
             'strip_answer_string',
-        ],
-        'completion_parsers': [
-            'ResponseParser',
-            'lmsys_parser',
-            'ranking_parser',
-        ],
+        ]
     }

     import sys
evalscope/metrics/llm_judge.py
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional

+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger

@@ -48,17 +49,18 @@ class LLMJudge:
     """

     def __init__(
-            self,
-            api_key: Optional[str] = None,
-            api_url: Optional[str] = None,
-            model_id: Optional[str] = None,
-            system_prompt: Optional[str] = None,
-            prompt_template: Optional[str] = None,
-            generation_config: Optional[Dict[str, Any]] = None,
-            score_pattern: Optional[str] = None,
-            score_mapping: Optional[Dict[str, float]] = None,
-            score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
-            **kwargs):
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = None,
+        model_id: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        prompt_template: Optional[str] = None,
+        generation_config: Optional[Dict[str, Any]] = None,
+        score_pattern: Optional[str] = None,
+        score_mapping: Optional[Dict[str, float]] = None,
+        score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
+        **kwargs
+    ):
         """
         Initialize LLMJudge metric.

@@ -79,14 +81,15 @@
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.generation_config = generation_config or {}
+        self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}

         # Default score mapping for A/B pattern
         self.score_type = score_type
         if self.score_type == JudgeScoreType.NUMERIC:
             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
-            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
-                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
+            self.prompt_template = prompt_template or os.environ.get(
+                'JUDGE_PROMPT_TEMPLATE', DEFAULT_NUMERIC_SCORE_TEMPLATE
+            )
         elif self.score_type == JudgeScoreType.PATTERN:
             self.score_pattern = score_pattern or r'(A|B)'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
@@ -97,36 +100,47 @@
         self._init_server_adapter()

     def _init_server_adapter(self):
-        from evalscope.models import ServerModelAdapter
-
-        # Initialize ServerModelAdapter
-        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
-
-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
+        from evalscope.api.model import GenerateConfig, get_model
+
+        self.model = get_model(
+            model=self.model_id,
+            eval_type='openai_api',
+            base_url=self.api_url,
+            api_key=self.api_key,
+            config=GenerateConfig(**self.generation_config),
+        )
+
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
             str: The response from the LLM
         """
-        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
-
-        # Inference configuration
-        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
-        if self.generation_config:
-            infer_cfg.update(self.generation_config)
-
-        if self.model_id == DEFAULT_JUDGE_MODEL:
-            # Disable thinking for the default judge model
-            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
-
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
-            response = self.server_adapter.process_single_input(input_data, infer_cfg)
+            response = self.model.generate(input_messages)

             # Extract content from response
-            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
+            llm_response = response.completion
             return llm_response
         except Exception as e:
             logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
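
A hedged sketch of the reworked judge API above: LLMJudge now builds an OpenAI-compatible model via get_model() and exposes judge() in place of the old __call__(). The endpoint, API key, and model id below are placeholders, not package defaults:

from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge(
    api_url='https://example.com/v1',  # placeholder judge endpoint
    api_key='EMPTY',                   # placeholder key
    model_id='my-judge-model',         # placeholder judge model id
    generation_config={'temperature': 0.0, 'max_tokens': 1024},
)

# Plain prompt plus optional system prompt; a ChatMessage list can be passed via messages= instead.
verdict = judge.judge(
    prompt='Question: 1 + 1 = ?\nAnswer A: 2\nAnswer B: 3\nWhich answer is correct?',
    system_prompt='You are a strict grader. Reply with A or B only.',
)
print(verdict)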
evalscope/metrics/math_parser.py
@@ -153,9 +153,11 @@ def strip_answer_string(string):

     # cdot
     # string = string.replace("\\cdot", "")
-    if (string.startswith('{') and string.endswith('}') and string.isalnum()
-            or string.startswith('(') and string.endswith(')') and string.isalnum()
-            or string.startswith('[') and string.endswith(']') and string.isalnum()):
+    if (
+        string.startswith('{') and string.endswith('}') and string.isalnum()
+        or string.startswith('(') and string.endswith(')') and string.isalnum()
+        or string.startswith('[') and string.endswith(']') and string.isalnum()
+    ):
         string = string[1:-1]

     # inf
@@ -387,9 +389,8 @@ def math_equal(

     ## deal with [], (), {}
     pred_str, ref_str = prediction, reference
-    if (prediction.startswith('[') and prediction.endswith(']')
-            and not reference.startswith('(')) or (prediction.startswith('(') and prediction.endswith(')')
-                                                   and not reference.startswith('[')):
+    if (prediction.startswith('[') and prediction.endswith(']') and not reference.startswith('(')
+        ) or (prediction.startswith('(') and prediction.endswith(')') and not reference.startswith('[')):
         pred_str = pred_str.strip('[]()')
         ref_str = ref_str.strip('[]()')
         for s in ['{', '}', '(', ')']:
@@ -399,25 +400,29 @@
             return True

     ## [a, b] vs. [c, d], return a==c and b==d
-    if (regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
-            and regex.match(r'(\(|\[).+(\)|\])', reference) is not None):
+    if (
+        regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
+        and regex.match(r'(\(|\[).+(\)|\])', reference) is not None
+    ):
         pred_parts = prediction[1:-1].split(',')
         ref_parts = reference[1:-1].split(',')
         if len(pred_parts) == len(ref_parts):
-            if all(
-                [math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close)
-                 for i in range(len(pred_parts))]):
+            if all([
+                math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))
+            ]):
                 return True
     if ((prediction.startswith('\\begin{pmatrix}') or prediction.startswith('\\begin{bmatrix}'))
-            and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
-            and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
-            and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
+        and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
+        and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
+        and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
         pred_lines = [
-            line.strip() for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+            line.strip()
+            for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         ref_lines = [
-            line.strip() for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+            line.strip()
+            for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         matched = True
@@ -427,12 +432,12 @@
             ref_parts = ref_line.split('&')
             if len(pred_parts) == len(ref_parts):
                 if not all([
-                        math_equal(
-                            pred_parts[i],
-                            ref_parts[i],
-                            include_percentage,
-                            is_close,
-                        ) for i in range(len(pred_parts))
+                    math_equal(
+                        pred_parts[i],
+                        ref_parts[i],
+                        include_percentage,
+                        is_close,
+                    ) for i in range(len(pred_parts))
                 ]):
                     matched = False
                     break
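
A small sketch of the list-comparison path reformatted above: math_equal splits bracketed answers on ',' and compares the parts pairwise. Keyword arguments are passed explicitly because only part of the signature is visible in this diff; the commented results are the expected outcomes under that logic:

from evalscope.metrics.math_parser import math_equal

print(math_equal('[1, 2]', '[1, 2]', include_percentage=True, is_close=True))  # expected: True
print(math_equal('[1, 2]', '[1, 3]', include_percentage=True, is_close=True))  # expected: False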