evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/filters/__init__.py (new file)
@@ -0,0 +1,2 @@
+ from .extraction import *
+ from .selection import *
evalscope/filters/extraction.py (new file)
@@ -0,0 +1,126 @@
+ import re
+ from typing import List
+
+ from evalscope.api.filter import Filter
+ from evalscope.api.registry import register_filter
+
+
+ @register_filter('regex')
+ class RegexFilter(Filter):
+     """A filter that extracts values from text using regex pattern matching.
+
+     This filter applies a regex pattern to each model response and extracts matched values.
+     If no match is found, returns a fallback value. Useful for extracting structured data
+     (like numbers) from unstructured model outputs.
+     """
+
+     def __init__(
+         self,
+         regex_pattern: str = r'#### (\-?[0-9\.\,]+)',
+         group_select: int = 0,
+         fallback: str = '[invalid]',
+     ) -> None:
+         """
+         pass a string `regex` to run `re.compile(r"regex")` on.
+         `fallback` defines the output returned if no matches for the regex are located.
+         """
+         self.regex_pattern = regex_pattern
+         self.regex = re.compile(regex_pattern)
+         self.group_select = group_select
+         self.fallback = fallback
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """Apply regex pattern to each string in the instance list."""
+         filtered = []
+         for resp in instance:
+             match = self.regex.findall(resp)
+             if match:
+                 match = match[self.group_select]
+                 if isinstance(match, tuple):
+                     match = [m for m in match if m]
+                     if match:
+                         match = match[0]
+                     else:
+                         match = self.fallback
+                 match = match.strip()
+             else:
+                 match = self.fallback
+             filtered.append(match)
+         return filtered
+
+
+ @register_filter('regex_pos')
+ class POSFilter(Filter):
+     """ """
+
+     def __init__(
+         self,
+         regex_pattern: str = r"\['(.*?)'\]",
+         group_select=0,
+         fallback=None,
+     ) -> None:
+         """
+         pass a string `regex` to run `re.compile(r"regex")` on.
+         `fallback` defines the output returned if no matches for the regex are located.
+         """
+         if fallback is None:
+             fallback = ['invalid']
+         self.regex_pattern = regex_pattern
+         self.regex = re.compile(regex_pattern)
+         self.group_select = group_select
+         self.fallback = fallback
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """Extract POS tags from each string in the instance list."""
+
+         def extract_tagged_tokens(text):
+             # Extract tagged tokens list from text input using regex
+             tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+             return [(token, pos) for token, pos in tokens]
+
+         def extract_pos_tags(result):
+             pos_tags = []
+             if isinstance(result, str):
+                 result = extract_tagged_tokens(result)
+             pos_tags.extend(pos for _, pos in result)
+             return pos_tags if pos_tags else self.fallback
+
+         filtered = []
+         for resp in instance:
+             match = extract_pos_tags(resp)
+             filtered.append(str(match))
+         return filtered
+
+
+ @register_filter('remove_whitespace')
+ class WhitespaceFilter(Filter):
+     """Filters out leading whitespace from responses."""
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """Remove leading whitespace from each string in the instance list."""
+         filtered_resp = []
+         for resp in instance:
+             resp = resp.lstrip()
+             filtered_resp.append(resp)
+         return filtered_resp
+
+
+ @register_filter('remove_until')
+ class RemoveUntilFilter(Filter):
+     """Filters out all text until a specified delimiter is found."""
+
+     def __init__(self, delimiter: str) -> None:
+         self.delimiter = delimiter
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """Remove all text until the delimiter from each string in the instance list."""
+         filtered_resp = []
+         for resp in instance:
+             resp = resp.split(self.delimiter, 1)[-1]
+             filtered_resp.append(resp)
+         return filtered_resp
+
+
+ @register_filter('extract')
+ class ExtractFilter(RegexFilter):
+     ...
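
For orientation, a minimal usage sketch of the RegexFilter added above (standalone call, not wired through the registry; the example strings and expected outputs are assumptions based on the default GSM8K-style pattern):

    from evalscope.filters.extraction import RegexFilter

    # Default pattern captures the number after '#### ', falling back to '[invalid]'.
    answer_filter = RegexFilter()
    responses = [
        'Step 1: 2 + 2 = 4\n#### 4',        # pattern matches -> '4'
        'I am not sure about the answer.',   # no match -> '[invalid]'
    ]
    print(answer_filter.apply(responses))    # expected: ['4', '[invalid]']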
evalscope/filters/selection.py (new file)
@@ -0,0 +1,57 @@
+ from collections import Counter
+ from typing import List
+
+ from evalscope.api.filter import Filter
+ from evalscope.api.registry import register_filter
+
+
+ @register_filter('take_first')
+ class TakeFirstFilter(Filter):
+
+     def __init__(self) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+         """
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """
+         Take only the first response from the instance list.
+         """
+         return [instance[0]] if instance else []
+
+
+ @register_filter('take_first_k')
+ class TakeKFilter(Filter):
+
+     def __init__(self, **kwargs) -> None:
+         self.k = kwargs.pop('k')
+         super().__init__(**kwargs)
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """
+         Take the first k responses from the instance list.
+         """
+         assert len(instance) >= self.k, (
+             f'Need at least {self.k} responses to take first {self.k}, but got {len(instance)} only!'
+         )
+         return instance[:self.k]
+
+
+ @register_filter('majority_vote')
+ class MajorityVoteFilter(Filter):
+
+     def __init__(self) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+         """
+
+     def apply(self, instance: List[str]) -> List[str]:
+         """
+         Select the response that occurs most frequently in the instance list.
+         """
+         if not instance:
+             return []
+
+         counts = Counter(instance)
+         vote = counts.most_common(1)[0][0]
+         return [vote]
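
Similarly, a short sketch of the new selection filters (hypothetical inputs; this assumes the Filter base class needs no extra constructor arguments, and TakeKFilter asserts that at least k responses are available):

    from evalscope.filters.selection import MajorityVoteFilter, TakeKFilter

    samples = ['42', '41', '42', '42']
    print(TakeKFilter(k=2).apply(samples))      # expected: ['42', '41']
    print(MajorityVoteFilter().apply(samples))  # expected: ['42'] (most frequent response)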
evalscope/metrics/__init__.py
@@ -4,12 +4,18 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule
 
  if TYPE_CHECKING:
- from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
  from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
  from .math_parser import extract_answer, math_equal, strip_answer_string
- from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
- weighted_mean)
- from .named_metrics import Metric, metric_registry
+ from .metric import PassAtK
+ from .metrics import (
+ bleu_ngram_one_sample,
+ exact_match,
+ macro_mean,
+ mean,
+ micro_mean,
+ simple_f1_score,
+ weighted_mean,
+ )
  from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh
 
  else:
@@ -23,9 +29,8 @@ else:
  'simple_f1_score',
  'weighted_mean',
  ],
- 'named_metrics': [
- 'Metric',
- 'metric_registry',
+ 'metric': [
+ 'PassAtK',
  ],
  'rouge_metric': [
  'compute_rouge_score_one_sample_zh',
@@ -41,12 +46,7 @@ else:
  'extract_answer',
  'math_equal',
  'strip_answer_string',
- ],
- 'completion_parsers': [
- 'ResponseParser',
- 'lmsys_parser',
- 'ranking_parser',
- ],
+ ]
  }
 
  import sys
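
In practical terms, the lazy-import table above changes what can be imported from evalscope.metrics in 1.0.0 (a sketch; the names are taken from the import structure shown above):

    from evalscope.metrics import PassAtK, exact_match, mean   # new/kept top-level exports
    # from evalscope.metrics import ResponseParser              # removed along with completion_parsers
    # from evalscope.metrics import metric_registry             # removed along with named_metrics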
evalscope/metrics/llm_judge.py
@@ -48,17 +48,18 @@ class LLMJudge:
  """
 
  def __init__(
- self,
- api_key: Optional[str] = None,
- api_url: Optional[str] = None,
- model_id: Optional[str] = None,
- system_prompt: Optional[str] = None,
- prompt_template: Optional[str] = None,
- generation_config: Optional[Dict[str, Any]] = None,
- score_pattern: Optional[str] = None,
- score_mapping: Optional[Dict[str, float]] = None,
- score_type: str = JudgeScoreType.PATTERN, # 'pattern', 'numeric'
- **kwargs):
+ self,
+ api_key: Optional[str] = None,
+ api_url: Optional[str] = None,
+ model_id: Optional[str] = None,
+ system_prompt: Optional[str] = None,
+ prompt_template: Optional[str] = None,
+ generation_config: Optional[Dict[str, Any]] = None,
+ score_pattern: Optional[str] = None,
+ score_mapping: Optional[Dict[str, float]] = None,
+ score_type: str = JudgeScoreType.PATTERN, # 'pattern', 'numeric'
+ **kwargs
+ ):
  """
  Initialize LLMJudge metric.
 
@@ -79,14 +80,15 @@ class LLMJudge:
  self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
  self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
  self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
- self.generation_config = generation_config or {}
+ self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}
 
  # Default score mapping for A/B pattern
  self.score_type = score_type
  if self.score_type == JudgeScoreType.NUMERIC:
  self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
- self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
- DEFAULT_NUMERIC_SCORE_TEMPLATE)
+ self.prompt_template = prompt_template or os.environ.get(
+ 'JUDGE_PROMPT_TEMPLATE', DEFAULT_NUMERIC_SCORE_TEMPLATE
+ )
  elif self.score_type == JudgeScoreType.PATTERN:
  self.score_pattern = score_pattern or r'(A|B)'
  self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
@@ -97,12 +99,17 @@ class LLMJudge:
  self._init_server_adapter()
 
  def _init_server_adapter(self):
- from evalscope.models import ServerModelAdapter
+ from evalscope.api.model import GenerateConfig, get_model
 
- # Initialize ServerModelAdapter
- self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
+ self.model = get_model(
+ model=self.model_id,
+ eval_type='openai_api',
+ base_url=self.api_url,
+ api_key=self.api_key,
+ config=GenerateConfig(**self.generation_config),
+ )
 
- def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
+ def judge(self, prompt: str, system_prompt: Optional[str] = None) -> str:
  """
  Args:
  prompt (str): The prompt to evaluate
@@ -110,23 +117,18 @@ class LLMJudge:
  Returns:
  str: The response from the LLM
  """
- input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
-
- # Inference configuration
- infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
- if self.generation_config:
- infer_cfg.update(self.generation_config)
-
- if self.model_id == DEFAULT_JUDGE_MODEL:
- # Disable thinking for the default judge model
- infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+ from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
 
+ system_content = system_prompt or self.system_prompt
+ input_messages = [ChatMessageUser(content=prompt)]
+ if system_content:
+ input_messages.insert(0, ChatMessageSystem(content=system_content))
  try:
  # Send request using ServerModelAdapter
- response = self.server_adapter.process_single_input(input_data, infer_cfg)
+ response = self.model.generate(input_messages)
 
  # Extract content from response
- llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
+ llm_response = response.completion
  return llm_response
  except Exception as e:
  logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
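
The judge entry point also changes from calling the instance directly to an explicit method, and requests now go through the new evalscope.api.model layer instead of ServerModelAdapter. A hedged call sketch (model id, URL, and key are placeholders, not package defaults):

    from evalscope.metrics import LLMJudge

    judge = LLMJudge(
        model_id='my-judge-model',           # hypothetical judge model
        api_url='https://example.com/v1',    # hypothetical OpenAI-compatible endpoint
        api_key='sk-...',
    )
    # 0.17.x: verdict = judge(prompt); 1.0.0: use the explicit judge() method.
    verdict = judge.judge(prompt='Compare answer A and answer B ...')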
evalscope/metrics/math_parser.py
@@ -153,9 +153,11 @@ def strip_answer_string(string):
 
  # cdot
  # string = string.replace("\\cdot", "")
- if (string.startswith('{') and string.endswith('}') and string.isalnum()
- or string.startswith('(') and string.endswith(')') and string.isalnum()
- or string.startswith('[') and string.endswith(']') and string.isalnum()):
+ if (
+ string.startswith('{') and string.endswith('}') and string.isalnum()
+ or string.startswith('(') and string.endswith(')') and string.isalnum()
+ or string.startswith('[') and string.endswith(']') and string.isalnum()
+ ):
  string = string[1:-1]
 
  # inf
@@ -387,9 +389,8 @@ def math_equal(
 
  ## deal with [], (), {}
  pred_str, ref_str = prediction, reference
- if (prediction.startswith('[') and prediction.endswith(']')
- and not reference.startswith('(')) or (prediction.startswith('(') and prediction.endswith(')')
- and not reference.startswith('[')):
+ if (prediction.startswith('[') and prediction.endswith(']') and not reference.startswith('(')
+ ) or (prediction.startswith('(') and prediction.endswith(')') and not reference.startswith('[')):
  pred_str = pred_str.strip('[]()')
  ref_str = ref_str.strip('[]()')
  for s in ['{', '}', '(', ')']:
@@ -399,25 +400,29 @@ def math_equal(
  return True
 
  ## [a, b] vs. [c, d], return a==c and b==d
- if (regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
- and regex.match(r'(\(|\[).+(\)|\])', reference) is not None):
+ if (
+ regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
+ and regex.match(r'(\(|\[).+(\)|\])', reference) is not None
+ ):
  pred_parts = prediction[1:-1].split(',')
  ref_parts = reference[1:-1].split(',')
  if len(pred_parts) == len(ref_parts):
- if all(
- [math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close)
- for i in range(len(pred_parts))]):
+ if all([
+ math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))
+ ]):
  return True
  if ((prediction.startswith('\\begin{pmatrix}') or prediction.startswith('\\begin{bmatrix}'))
- and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
- and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
- and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
+ and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
+ and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
+ and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
  pred_lines = [
- line.strip() for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+ line.strip()
+ for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
  if line.strip()
  ]
  ref_lines = [
- line.strip() for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+ line.strip()
+ for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
  if line.strip()
  ]
  matched = True
@@ -427,12 +432,12 @@ def math_equal(
  ref_parts = ref_line.split('&')
  if len(pred_parts) == len(ref_parts):
  if not all([
- math_equal(
- pred_parts[i],
- ref_parts[i],
- include_percentage,
- is_close,
- ) for i in range(len(pred_parts))
+ math_equal(
+ pred_parts[i],
+ ref_parts[i],
+ include_percentage,
+ is_close,
+ ) for i in range(len(pred_parts))
  ]):
  matched = False
  break
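
The math_parser hunks are re-wraps of existing comparison logic rather than behavioural changes. For context, a hedged sketch of how the tuple branch shown above is typically exercised (the full body of math_equal is not part of this diff, so the expected results are assumptions):

    from evalscope.metrics import math_equal

    print(math_equal('[1, 2]', '(1, 2)'))   # element-wise comparison; expected True
    print(math_equal('(3, 4)', '(3, 5)'))   # second elements differ; expected False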