evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/metric/metric.py
@@ -0,0 +1,55 @@
+ from abc import ABC, abstractmethod
+ from typing import Callable, Iterable, List, Union
+
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+
+ logger = get_logger()
+
+
+ class Metric(ABC):
+     """
+     Metric classes operate on a sample level.
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Metric class should have state.
+         """
+
+     @abstractmethod
+     def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+         pass
+
+     def __call__(self, prediction: str, reference: str) -> float:
+         """
+         Allows the metric to be called like a function.
+         """
+         return self.apply([prediction], [reference])[0]
+
+
+ class T2IMetric(Metric):
+     _instance = None
+
+     @thread_safe
+     def __new__(cls, *args, **kwargs):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     def __init__(self, *args, **kwargs):
+         cls = self.__class__
+         if hasattr(self, '_init_done'):
+             return
+         logger.info(f'Initializing {cls.__name__}...')
+         self._init_once(*args, **kwargs)
+         self._init_done = True
+
+     def _init_once(self, *args, **kwargs):
+         pass
+
+     def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
+         pass
+
+     def __call__(self, image: str, text: str, **kwargs) -> Union[float, dict]:
+         return self.apply([image], [text], **kwargs)[0]
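For orientation, a minimal sketch of how the new sample-level Metric base class above can be subclassed. The ExactMatch name is illustrative and not part of the package; only the Metric interface shown in the hunk is assumed.

    from typing import List

    from evalscope.api.metric.metric import Metric


    class ExactMatch(Metric):
        """Hypothetical sample-level metric: 1.0 when the prediction equals the reference."""

        def apply(self, predictions: List[str], references: List[str]) -> List[float]:
            # Sample-level scoring over parallel lists of predictions and references.
            return [float(p.strip() == r.strip()) for p, r in zip(predictions, references)]


    metric = ExactMatch()
    print(metric('42', '42'))  # 1.0 -- Metric.__call__ wraps apply([...]) and returns the first element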
evalscope/api/metric/scorer.py
@@ -0,0 +1,105 @@
+ from pydantic import BaseModel, Field
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ Value = Dict[str, Union[int, float, bool]]
+
+
+ class Score(BaseModel):
+     """Score generated by a scorer."""
+
+     value: Value = Field(default_factory=dict)
+     """Score value as a dictionary. Key is the score name, value is the score value.
+     The first key is considered the main score by default."""
+
+     extracted_prediction: Optional[str] = Field(default=None)
+     """Answer extracted from model output (optional)"""
+
+     prediction: Optional[str] = Field(default=None)
+     """Original prediction text from the model (optional)"""
+
+     explanation: Optional[str] = Field(default=None)
+     """Explanation of score (optional)."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
+     """Additional metadata related to the score"""
+
+     main_score_name: Optional[str] = Field(default=None)
+     """Main score name, if applicable. This is used to indicate which score is the primary score in a multi-score scenario."""  # noqa: E501
+
+     @property
+     def main_value(self) -> Union[int, float, bool]:
+         """Main score value."""
+         if self.main_score_name and self.main_score_name in self.value:
+             return self.value[self.main_score_name]
+         return next(iter(self.value.values()), None)
+
+     @main_value.setter
+     def main_value(self, value: Union[int, float, bool]):
+         """Set the main score value."""
+         if self.main_score_name:
+             self.value[self.main_score_name] = value
+         else:
+             # If no main score name is set, just update the first value
+             if self.value:
+                 first_key = next(iter(self.value))
+                 self.value[first_key] = value
+             else:
+                 self.value['default'] = value
+
+
+ class SampleScore(BaseModel):
+     """Score for a Sample."""
+
+     score: Score
+     """A score"""
+
+     sample_id: Optional[Union[str, int]] = Field(default=None)
+     """A sample id"""
+
+     group_id: Optional[Union[str, int]] = Field(default=None)
+     """A group id for the sample, used for grouping k repeated samples."""
+
+     sample_metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Metadata from the sample"""
+
+
+ class AggScore(BaseModel):
+     """Output of an aggregation operation."""
+
+     score: float = Field(default=0.0)
+     """Aggregated value as a float."""
+
+     metric_name: str = Field(default='')
+     """Name of the metric being aggregated."""
+
+     aggregation_name: str = Field(default='')
+     """Name of the aggregation method."""
+
+     num: int = Field(default=0)
+     """Number of samples used in the aggregation."""
+
+     ids: Optional[List[Union[str, int]]] = Field(default=None)
+     """List of sample IDs used in the aggregation, if applicable."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional metadata related to the aggregation."""
+
+
+ class Aggregator:
+
+     name = 'default'
+
+     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+         r"""Aggregate a metric on a list of scores.
+
+         Args:
+             scores: List of scores.
+
+         Returns:
+             List[AggScore]: List of aggregated outputs.
+         """
+         ...
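A short illustration of the scoring data model above: building Score/SampleScore objects and aggregating them into an AggScore. The MeanAggregator class and the 'acc' key are hypothetical; only the classes defined in scorer.py are assumed.

    from typing import List

    from evalscope.api.metric.scorer import AggScore, Aggregator, SampleScore, Score


    class MeanAggregator(Aggregator):
        """Illustrative aggregator: averages the main value of each sample score."""

        name = 'mean'

        def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
            values = [float(s.score.main_value) for s in scores]
            return [
                AggScore(
                    score=sum(values) / len(values) if values else 0.0,
                    metric_name='acc',
                    aggregation_name=self.name,
                    num=len(values),
                    ids=[s.sample_id for s in scores],
                )
            ]


    samples = [
        SampleScore(score=Score(value={'acc': 1.0}, main_score_name='acc'), sample_id=1),
        SampleScore(score=Score(value={'acc': 0.0}, main_score_name='acc'), sample_id=2),
    ]
    print(MeanAggregator()(samples)[0].score)  # 0.5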
evalscope/api/mixin/__init__.py
@@ -0,0 +1,2 @@
+ from .dataset_mixin import DatasetLoaderMixin
+ from .llm_judge_mixin import LLMJudgeMixin
evalscope/api/mixin/dataset_mixin.py
@@ -0,0 +1,105 @@
+ from abc import ABC
+ from collections import defaultdict
+ from typing import Any, Callable, Dict
+
+ from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
+
+
+ class DatasetLoaderMixin:
+     """
+     Mixin class providing dataset loading functionality for benchmarks.
+
+     This mixin provides common dataset loading methods that can be shared
+     across different data adapters, including support for:
+     - Loading multiple subsets
+     - Few-shot dataset loading
+     - Remote dataset loading with configuration
+     """
+
+     def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
+         """
+         Load multiple subsets of the dataset using the provided loading function.
+
+         This method handles two loading strategies:
+         1. Reformat mode: Load only the default subset and reformat it
+         2. Multi-subset mode: Load all subsets specified in subset_list
+
+         Args:
+             load_func (Callable[[str], Dataset]): Function to load individual subsets
+
+         Returns:
+             DatasetDict: Dictionary containing all loaded subsets
+         """
+         if self.reformat_subset:
+             # Load only the default subset
+             subset_data = load_func(self.default_subset)
+             # Reformat the subset to create multiple subsets based on sample keys
+             # NOTE: subset_list and limit are applied here if specified
+             dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
+         else:
+             # Load all specified subsets into separate entries
+             subset_dict = defaultdict()
+             for subset in self.subset_list:
+                 subset_data = load_func(subset)
+                 subset_dict[subset] = subset_data
+             dataset_dict = DatasetDict(subset_dict)
+         return dataset_dict
+
+     def load_subset(self, subset: str) -> Dataset:
+         """
+         Load a specific subset of the dataset for evaluation.
+
+         This method configures and executes the data loading for a single subset,
+         handling both split-as-subset and traditional subset configurations.
+
+         Args:
+             subset (str): The subset identifier to load
+
+         Returns:
+             Dataset: The loaded dataset subset with processed samples
+         """
+         # Determine the split and subset names based on configuration
+         split = subset if self.split_as_subset else self.eval_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create and configure the remote data loader
+         loader = RemoteDataLoader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,  # Custom sample conversion function
+             limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
+             repeats=self._task_config.repeats,  # Number of repetitions for each sample
+             data_source=self._task_config.dataset_hub,  # Data source configuration
+         )
+         return loader.load()
+
+     def load_fewshot_subset(self, subset: str) -> Dataset:
+         """
+         Load a subset specifically for few-shot examples.
+
+         This method loads training data to be used as demonstrations in few-shot prompting.
+         It typically loads from the training split with limited samples and optional shuffling.
+
+         Args:
+             subset (str): The subset identifier to load few-shot examples from
+
+         Returns:
+             Dataset: The loaded few-shot dataset with demonstration examples
+         """
+         # Use training split for few-shot examples
+         split = subset if self.split_as_subset else self.train_split
+         subset_name = self.default_subset if self.split_as_subset else subset
+
+         # Create loader specifically configured for few-shot sampling
+         loader = RemoteDataLoader(
+             data_id_or_path=self.dataset_id,
+             split=split,
+             subset=subset_name,
+             sample_fields=self.record_to_sample,
+             limit=self.few_shot_num
+             if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
+             shuffle=self.few_shot_random,  # Randomize selection if enabled
+             data_source=self._task_config.dataset_hub,
+         )
+         return loader.load()
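The mixin above reads a number of attributes from its host adapter (dataset id, subset list, splits, few-shot settings, and a _task_config exposing repeats and dataset_hub). A schematic sketch of that contract with placeholder values follows; the class name and values are hypothetical, and no data is fetched until load_subsets/load_subset is actually called.

    from evalscope.api.mixin import DatasetLoaderMixin


    class MyAdapter(DatasetLoaderMixin):
        """Schematic host class; attribute names mirror what the mixin reads, values are placeholders."""

        dataset_id = 'org/dataset'   # resolved by RemoteDataLoader
        default_subset = 'default'
        subset_list = ['default']
        split_as_subset = False      # if True, each "subset" name is treated as a dataset split
        eval_split = 'test'
        train_split = 'train'
        reformat_subset = False      # if True, the default subset is loaded once and re-split by sample keys
        limit = None
        few_shot_num = 5
        few_shot_random = True
        # A `_task_config` exposing `repeats` and `dataset_hub` is also required before
        # load_subset() runs; it is normally supplied by the benchmark adapter.

        def record_to_sample(self, record):
            """Convert a raw record into a Sample (benchmark-specific)."""
            ...

    # adapter = MyAdapter(); adapter.load_subsets(adapter.load_subset) would then fetch every
    # subset in subset_list, or re-split the default subset when reformat_subset is True.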
evalscope/api/mixin/llm_judge_mixin.py
@@ -0,0 +1,168 @@
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.constants import JudgeStrategy
+ from evalscope.metrics import LLMJudge
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class LLMJudgeMixin:
+     """
+     Mixin class for LLM Judge functionality.
+     """
+
+     def __init__(self, task_config: 'TaskConfig'):
+         self._task_config = task_config
+         self._use_llm_judge = False
+         """Whether to use LLM as a judge"""
+
+         self._llm_judge: Optional[LLMJudge] = None
+
+     @property
+     def llm_judge(self) -> Optional[LLMJudge]:
+         """Get LLM judge instance with lazy initialization."""
+         if self._llm_judge is None and self.use_llm_judge:
+             self._llm_judge = self.init_llm_judge()
+         return self._llm_judge
+
+     @llm_judge.setter
+     def llm_judge(self, value: Optional[LLMJudge]):
+         """Set LLM judge instance."""
+         self._llm_judge = value
+
+     @property
+     def judge_strategy(self) -> str:
+         """Get the judge strategy from the task configuration."""
+         return self._task_config.judge_strategy
+
+     @property
+     def use_llm_judge(self) -> bool:
+         """Check if LLM judge is enabled."""
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return False
+         elif self.judge_strategy == JudgeStrategy.LLM:
+             return True
+         elif self.judge_strategy == JudgeStrategy.LLM_RECALL:
+             return True
+         elif self.judge_strategy == JudgeStrategy.AUTO:
+             return self._use_llm_judge
+         else:
+             logger.warning(f'Unknown judge strategy: {self.judge_strategy}. Defaulting to False.')
+             return False
+
+     def init_llm_judge(self) -> Optional[LLMJudge]:
+         """
+         Initialize the LLM judge for the benchmark.
+
+         Returns:
+             Optional[LLMJudge]: The initialized LLM judge instance or None
+         """
+
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return None
+         else:
+             return LLMJudge(**self._task_config.judge_model_args)
+
+     def maybe_llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+         rule_based_score: Optional[Score] = None,
+     ) -> Score:
+         """
+         Compute the match score between the original and filtered predictions against the reference.
+
+         Args:
+             original_prediction: The original prediction output from the model.
+             filtered_prediction: The filtered prediction output from the model.
+             reference: The ground truth reference output.
+             task_state: The current task state.
+             rule_based_score: Optional rule-based score to be used for comparison.
+
+         Returns:
+             Score: The computed match score.
+         """
+         # If LLM judge is not used, return the rule-based score directly
+         if not self.use_llm_judge:
+             return rule_based_score
+
+         # For LLM_RECALL, if rule-based score is already perfect, skip LLM judge
+         if float(rule_based_score.main_value) > 0.99:
+             return rule_based_score
+
+         # Compute LLM judge score
+         llm_score = self.llm_match_score(
+             original_prediction=original_prediction,
+             filtered_prediction=filtered_prediction,
+             reference=reference,
+             task_state=task_state,
+         )
+
+         # For LLM RECALL, merge the scores
+         return self._merge_scores(rule_based_score, llm_score)
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """Compute the LLM match score.
+
+         Args:
+             original_prediction (str): The original prediction output from the model.
+             filtered_prediction (str): The filtered prediction output from the model.
+             reference (str): The ground truth reference output.
+             task_state (TaskState): The current task state.
+
+         Returns:
+             Score: The computed match score.
+         """
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Request judge and obtain score
+         prompt = self.llm_judge.build_prompt(pred=original_prediction, gold=reference, question=question)
+         judge_response = self.llm_judge.judge(prompt)
+         judge_score = self.llm_judge.get_score(judge_response)
+
+         score.value = {'acc': judge_score}
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+
+         return score
+
+     def _merge_scores(self, rule_based_score: Score, llm_score: Score) -> Score:
+         """
+         Merge rule-based score with LLM judge score for LLM_RECALL strategy.
+
+         Args:
+             rule_based_score: The original rule-based score
+             llm_score: The LLM judge score
+
+         Returns:
+             Score: The merged score
+         """
+         # Update the main value with LLM judge result
+         rule_based_score.main_value = llm_score.main_value
+         rule_based_score.explanation = llm_score.explanation
+         rule_based_score.metadata = llm_score.metadata
+
+         return rule_based_score
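A hedged sketch of the rule-only path through maybe_llm_match_score. A SimpleNamespace stands in for a full TaskConfig here (an assumption, since the mixin only reads judge_strategy and judge_model_args); with JudgeStrategy.RULE no judge model is ever contacted and the rule-based score is returned unchanged.

    from types import SimpleNamespace  # stand-in for a full TaskConfig in this sketch

    from evalscope.api.metric import Score
    from evalscope.api.mixin import LLMJudgeMixin
    from evalscope.constants import JudgeStrategy

    # Rule-only configuration: use_llm_judge evaluates to False for JudgeStrategy.RULE.
    adapter = LLMJudgeMixin(task_config=SimpleNamespace(judge_strategy=JudgeStrategy.RULE,
                                                        judge_model_args={}))

    rule_score = Score(value={'acc': 1.0}, main_score_name='acc')
    final = adapter.maybe_llm_match_score(
        original_prediction='The answer is 42.',
        filtered_prediction='42',
        reference='42',
        task_state=None,            # unused on the rule-based path
        rule_based_score=rule_score,
    )
    print(final.main_value)  # 1.0 -- the rule-based score is passed through untouched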
evalscope/api/model/__init__.py
@@ -0,0 +1,12 @@
+ from .generate_config import GenerateConfig
+ from .model import Model, ModelAPI, get_model, get_model_with_task_config
+ from .model_output import (
+     ChatCompletionChoice,
+     Logprob,
+     Logprobs,
+     ModelOutput,
+     ModelUsage,
+     StopReason,
+     TopLogprob,
+     as_stop_reason,
+ )
evalscope/api/model/generate_config.py
@@ -0,0 +1,157 @@
+ # flake8: noqa: E501
+ from copy import deepcopy
+ from pydantic import BaseModel, Field, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from evalscope.utils.json_schema import JSONSchema
+
+
+ class ResponseSchema(BaseModel):
+     """Schema for model response when using Structured Output."""
+
+     name: str
+     """The name of the response schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64."""
+
+     json_schema: JSONSchema
+     """The schema for the response format, described as a JSON Schema object."""
+
+     description: Optional[str] = Field(default=None)
+     """A description of what the response format is for, used by the model to determine how to respond in the format."""
+
+     strict: Optional[bool] = Field(default=None)
+     """Whether to enable strict schema adherence when generating the output. If set to true, the model will always follow the exact schema defined in the schema field.
+     OpenAI and Mistral only."""
+
+
+ class GenerateConfig(BaseModel):
+     """Model generation options."""
+
+     max_retries: Optional[int] = Field(default=None)
+     """Maximum number of times to retry request (defaults to unlimited)."""
+
+     timeout: Optional[int] = Field(default=None)
+     """Request timeout (in seconds)."""
+
+     batch_size: Optional[int] = Field(default=None)
+     """Maximum number of concurrent connections to Model API (default is model specific) or batch size for generation."""
+
+     stream: Optional[bool] = Field(default=None)
+     """Whether to stream the response (default is model specific)."""
+
+     system_message: Optional[str] = Field(default=None)
+     """Override the default system message."""
+
+     max_tokens: Optional[int] = Field(default=None)
+     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
+
+     top_p: Optional[float] = Field(default=None)
+     """An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass."""
+
+     temperature: Optional[float] = Field(default=None)
+     """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."""
+
+     stop_seqs: Optional[List[str]] = Field(default=None)
+     """Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""
+
+     best_of: Optional[int] = Field(default=None)
+     """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
+
+     frequency_penalty: Optional[float] = Field(default=None)
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+     presence_penalty: Optional[float] = Field(default=None)
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+     logit_bias: Optional[Dict[int, float]] = Field(default=None)
+     """Map token IDs to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and vLLM only."""
+
+     seed: Optional[int] = Field(default=None)
+     """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
+
+     do_sample: Optional[bool] = Field(default=None)
+     """Whether to use sampling; use greedy decoding otherwise. Only transformers models support this parameter."""
+
+     top_k: Optional[int] = Field(default=None)
+     """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""
+
+     n: Optional[int] = Field(default=None)
+     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""
+
+     logprobs: Optional[bool] = Field(default=None)
+     """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""
+
+     top_logprobs: Optional[int] = Field(default=None)
+     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""
+
+     parallel_tool_calls: Optional[bool] = Field(default=None)
+     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
+
+     internal_tools: Optional[bool] = Field(default=None)
+     """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""
+
+     max_tool_output: Optional[int] = Field(default=None)
+     """Maximum tool output (in bytes). Defaults to 16 * 1024."""
+
+     cache_prompt: Union[Literal['auto'], bool, None] = Field(default=None)
+     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
+
+     reasoning_effort: Optional[Literal['low', 'medium', 'high']] = Field(default=None)
+     """Constrains effort on reasoning for reasoning models (defaults to `medium`). OpenAI o1 models only."""
+
+     reasoning_tokens: Optional[int] = Field(default=None)
+     """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+     reasoning_summary: Optional[Literal['concise', 'detailed', 'auto']] = Field(default=None)
+     """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""
+
+     reasoning_history: Optional[Literal['none', 'all', 'last', 'auto']] = Field(default=None)
+     """Include reasoning in chat message history sent to generate."""
+
+     response_schema: Optional[ResponseSchema] = Field(default=None)
+     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
+
+     extra_body: Optional[Dict[str, Any]] = Field(default=None)
+     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+     height: Optional[int] = Field(default=None)
+     """Image height, for image generation models only."""
+
+     width: Optional[int] = Field(default=None)
+     """Image width, for image generation models only."""
+
+     num_inference_steps: Optional[int] = Field(default=None)
+     """Number of inference steps, for image generation models only."""
+
+     guidance_scale: Optional[float] = Field(default=None)
+     """Guidance scale, for image generation models only."""
+
+     # migrate reasoning_history as a bool
+     @model_validator(mode='before')
+     @classmethod
+     def migrate_reasoning(cls, data: Any) -> Any:
+         if isinstance(data, dict):
+             reasoning_history = data.get('reasoning_history', None)
+             if reasoning_history is True:
+                 data['reasoning_history'] = 'all'
+             elif reasoning_history is False:
+                 data['reasoning_history'] = 'none'
+
+         return data
+
+     def merge(self, other: 'GenerateConfig') -> 'GenerateConfig':
+         """Merge another model configuration into this one.
+
+         Args:
+             other (GenerateConfig):
+                 Configuration to merge.
+
+         Returns:
+             Merged configuration.
+         """
+         config_keys = [field for field in self.__class__.model_fields.keys()]
+         config = deepcopy(self)
+         for key in config_keys:
+             value = getattr(other, key, None)
+             if value is not None:
+                 setattr(config, key, value)
+         return config
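A small usage sketch based on the code above: merge lets every non-None field of another config override this one, and the before-validator migrates a legacy boolean reasoning_history. The expected values noted in the comments follow directly from the definitions in this file.

    from evalscope.api.model import GenerateConfig  # re-exported by evalscope/api/model/__init__.py

    base = GenerateConfig(temperature=0.0, max_tokens=512)
    override = GenerateConfig(temperature=0.7, reasoning_history=True)  # legacy bool input

    merged = base.merge(override)
    print(merged.temperature)          # 0.7 -- non-None fields of `other` win
    print(merged.max_tokens)           # 512 -- fields left unset in `other` keep the base value
    print(override.reasoning_history)  # 'all' -- migrate_reasoning maps True to 'all' before validation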