evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/evaluator/state.py
@@ -0,0 +1,264 @@
+ from dataclasses import dataclass
+ from random import Random
+ from typing import Any, Dict, List, Optional, Sequence, Union, overload
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessage, ChatMessageUser, messages_pretty_str
+ from evalscope.api.model import ModelOutput
+
+
+ class Target(Sequence[str]):
+     """Target for scoring against the current TaskState.
+
+     Target is a sequence of one or more strings. Use the
+     `text` property to access the value as a single string.
+     """
+
+     def __init__(self, target: Union[str, List[str]]) -> None:
+         self.target = target if isinstance(target, list) else [target]
+
+     @overload
+     def __getitem__(self, index: int) -> str:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[str]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[str, Sequence[str]]:
+         return self.target[index]
+
+     def __len__(self) -> int:
+         return len(self.target)
+
+     @property
+     def text(self) -> str:
+         return ''.join(self.target)
+
+
+ @dataclass
+ class Choice:
+     """
+     A `Choice` represents a single choice in a multiple choice question.
+
+     It is only relevant for the `multiple_choice` solver and corresponding
+     `choice` scorer.
+     """
+
+     value: str
+     """The original value of the choice from the `Sample`."""
+
+     correct: Optional[bool]
+     """Did the model think this choice satisfies the question? `None`
+     indicates this has not been set yet."""
+
+     original_position: int
+     """Choices may be re-ordered during processing; this records the
+     original position in the sample's list of choices."""
+
+
+ class Choices(Sequence[Choice]):
+     """
+     Wrapper class for a list of `Choice` objects.
+
+     Primarily serves to abstract choice-specific functionality away
+     from the already-large `TaskState` class.
+     """
+
+     def __init__(self, choices: Union[List[str], List[Choice]]) -> None:
+         """
+         Setter for choices, intended to be used only with the `multiple_choice` scorer.
+
+         Choices come from the sample's list of choices and are used
+         specifically by the `multiple_choice` scorer.
+
+         For example, if the sample was a multiple choice question like "What is
+         the capital of France? A) Paris B) London C) Berlin", we would store the
+         possible answers here.
+         """
+         self._choices: List[Choice] = []
+
+         for i, choice in enumerate(choices):
+             if isinstance(choice, str):
+                 self._choices.append(Choice(value=choice, correct=None, original_position=i))
+             elif isinstance(choice, Choice):
+                 self._choices.append(choice)
+
+     @overload
+     def __getitem__(self, index: int) -> Choice:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[Choice]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[Choice, Sequence[Choice]]:
+         return self._choices[index]
+
+     def __len__(self) -> int:
+         return len(self._choices)
+
+     def mark_choice(self, index: int, correct: bool) -> None:
+         """Set the value of a specific choice."""
+         self._choices[index].correct = correct
+
+     def shuffle(self, rand: Random = Random()) -> None:
+         """
+         Shuffle the choice order, setting `original_position` so choices can be mapped back to their original order.
+
+         Some evals will shuffle the choices from the original sample to try to
+         avoid the model answering correctly due to fine-tuning (or similar) on
+         specific datasets.
+         """
+         shuffled_positions = list(range(len(self._choices)))
+         rand.shuffle(shuffled_positions)
+
+         shuffled_choices = [Choice('notachoice', None, -1)] * len(self._choices)
+
+         for i, shuffled_position in enumerate(shuffled_positions):
+             shuffled_choices[i] = self._choices[shuffled_position]
+             shuffled_choices[i].original_position = shuffled_position
+
+         self._choices = shuffled_choices
+
+
+ class TaskState:
+     """
+     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
+
+     The `TaskState` is passed to and returned from each solver during a sample's
+     evaluation. It allows us to maintain the manipulated message history, the tools
+     available to the model, the final output of the model, and whether the task
+     is completed or has hit a limit.
+     """
+
+     def __init__(
+         self,
+         model: str,
+         sample: Sample,
+         messages: List[ChatMessage] = [],
+         output: Optional[ModelOutput] = None,
+         completed: bool = False,
+     ) -> None:
+         self._model = model
+         self._sample = sample
+         self._sample_id = sample.id
+         self._group_id = sample.group_id
+         self._input = sample.input
+         self._target = Target(sample.target)
+         self._metadata = sample.metadata
+         self._messages: List[ChatMessage] = messages
+         self._output = output if output else ModelOutput(model=str(model))
+         self._completed = completed
+         if sample.choices:
+             self._choices = Choices(sample.choices)
+         else:
+             self._choices = Choices([])
+
+     @property
+     def model(self) -> str:
+         """Name of model being evaluated."""
+         return self._model
+
+     @property
+     def sample_id(self) -> int:
+         """Unique id for sample."""
+         return self._sample_id
+
+     @property
+     def group_id(self) -> int:
+         """Group id for sample."""
+         return self._group_id
+
+     @property
+     def input(self) -> Union[str, List[ChatMessage]]:
+         """Input from the `Sample`; should be considered immutable."""
+         return self._input
+
+     @property
+     def input_text(self) -> str:
+         """
+         Convenience function for accessing the initial input from the `Sample` as a string.
+
+         If the `input` is a `List[ChatMessage]`, this returns a pretty-printed
+         rendering of those messages.
+         """
+         if isinstance(self._input, str):
+             return self._input
+         else:
+             return messages_pretty_str(self._input)
+
+     @property
+     def choices(self) -> Choices:
+         """Choices for the sample, if applicable."""
+         return self._choices
+
+     @property
+     def user_prompt(self) -> ChatMessageUser:
+         """User prompt for this state.
+
+         Tasks are very general and can have many types of inputs.
+         However, in many cases solvers assume they can interact with
+         the state as a "chat" in a predictable fashion (e.g. prompt
+         engineering solvers). This property enables easy read and
+         write access to the user chat prompt. Raises an
+         exception if there is no user prompt.
+         """
+         prompt = next((m for m in reversed(self.messages) if m.role == 'user'), None)
+         if prompt:
+             return prompt
+         else:
+             raise ValueError('user_prompt requested from TaskState but none available')
+
+     @property
+     def metadata(self) -> Dict[str, Any]:
+         """Metadata from the `Sample` for this `TaskState`."""
+         return self._metadata
+
+     @metadata.setter
+     def metadata(self, metadata: Dict[str, Any]) -> None:
+         self._metadata = metadata
+
+     @property
+     def messages(self) -> List[ChatMessage]:
+         """
+         Chat conversation history for sample.
+
+         This will generally get appended to every time a `generate` call is made
+         to the model. Useful both for debugging and for solvers/scorers to assess
+         model performance or choose the next step.
+         """
+         return self._messages
+
+     @messages.setter
+     def messages(self, messages: List[ChatMessage]) -> None:
+         self._messages = messages
+
+     @property
+     def output(self) -> ModelOutput:
+         """
+         The 'final' model output once we've completed all solving.
+
+         For simple evals this may just be the last `message` from the
+         conversation history, but more complex solvers may set this directly.
+         """
+         return self._output
+
+     @output.setter
+     def output(self, output: ModelOutput) -> None:
+         self._output = output
+
+     @property
+     def completed(self) -> bool:
+         """Is the task completed."""
+         return self._completed
+
+     @completed.setter
+     def completed(self, completed: bool) -> None:
+         """Set the completed status."""
+         self._completed = completed
+
+     @property
+     def target(self) -> str:
+         """The scoring target for this `Sample`."""
+         return self._target.text
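For orientation, a short sketch (not part of the diff) of how the new `Target` and `Choices` types behave, using only classes defined in this hunk:

from random import Random

target = Target(['Paris'])
print(target.text)  # 'Paris'

choices = Choices(['Paris', 'London', 'Berlin'])
choices.shuffle(Random(42))   # reorder; original_position records where each choice came from
choices.mark_choice(0, correct=True)
print([(c.value, c.original_position, c.correct) for c in choices])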
evalscope/api/filter/__init__.py
@@ -0,0 +1 @@
+ from .filter import Filter, FilterEnsemble, build_filter_ensemble
evalscope/api/filter/filter.py
@@ -0,0 +1,72 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any, Callable, Dict, Iterable, List, Union
+
+ from evalscope.api.registry import get_filter
+
+
+ class Filter(ABC):
+     """
+     Filter classes operate on a sample level.
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+         """
+
+     @abstractmethod
+     def apply(self, instance: List[str]) -> List[str]:
+
+         return instance
+
+     def __call__(self, instance: str) -> str:
+         """
+         Allows the filter to be called like a function.
+         """
+         return self.apply([instance])[0]
+
+
+ @dataclass
+ class FilterEnsemble:
+     """
+     FilterEnsemble creates a pipeline applying multiple filters.
+     Its intended usage is to stack multiple post-processing steps in order.
+     """
+
+     name: str
+     filters: List[Callable[[], Filter]]
+
+     def apply(self, instance: List[str]) -> List[str]:
+
+         for f in self.filters:
+             # apply filters in sequence
+             instance = f.apply(instance)
+
+         return instance
+
+     def __call__(self, instance: str) -> str:
+         """
+         Allows the filter ensemble to be called like a function.
+         """
+         return self.apply([instance])[0]
+
+
+ def build_filter_ensemble(name: str = 'default', filters: Dict[str, Any] = {}) -> FilterEnsemble:
+     """
+     Create a filtering pipeline.
+     """
+     filter_funcs = []
+     for filter_name, filter_args in filters.items():
+         filter_cls = get_filter(filter_name)
+         if isinstance(filter_args, list):
+             filter_function = filter_cls(*filter_args)
+         elif isinstance(filter_args, dict):
+             filter_function = filter_cls(**filter_args)
+         else:
+             # Assume single value for simple filters
+             filter_function = filter_cls(filter_args)
+         # add the filter as a pipeline step
+         filter_funcs.append(filter_function)
+
+     return FilterEnsemble(name=name, filters=filter_funcs)
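A sketch (not part of the diff) of assembling a pipeline with `build_filter_ensemble`. The filter name 'take_first' is hypothetical; real names must be registered and resolvable via `get_filter` in evalscope/api/registry.py:

# Hypothetical usage; assumes a filter named 'take_first' taking no args is registered.
ensemble = build_filter_ensemble(name='postprocess', filters={'take_first': []})
cleaned = ensemble('Answer: B')  # wraps the string in a list, applies each filter in order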
evalscope/api/messages/__init__.py
@@ -0,0 +1,11 @@
+ from .chat_message import (
+     ChatMessage,
+     ChatMessageAssistant,
+     ChatMessageSystem,
+     ChatMessageTool,
+     ChatMessageUser,
+     dict_to_chat_message,
+     messages_pretty_str,
+ )
+ from .content import Content, ContentAudio, ContentData, ContentImage, ContentReasoning, ContentText, ContentVideo
+ from .utils import parse_content_with_reasoning
evalscope/api/messages/chat_message.py
@@ -0,0 +1,198 @@
+ import uuid
+ from pydantic import BaseModel, Field, JsonValue, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+ from evalscope.api.tool import ToolCall, ToolCallError
+ from .content import Content, ContentReasoning, ContentText
+ from .utils import parse_content_with_reasoning
+
+
+ class ChatMessageBase(BaseModel):
+     """Base class for chat messages."""
+
+     id: Optional[str] = Field(default=None)
+     """Unique identifier for message."""
+
+     content: Union[str, List[Content]]
+     """Content (simple string or list of content objects)."""
+
+     source: Optional[Literal['input', 'generate']] = Field(default=None)
+     """Source of message."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional message metadata."""
+
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+     def model_post_init(self, __context: Any) -> None:
+         # Generate an ID if none was provided
+         if self.id is None:
+             self.id = uuid.uuid4().hex[:8]  # Shorten to 8 characters for simplicity
+
+     @property
+     def text(self) -> str:
+         """Get the text content of this message.
+
+         ChatMessage content is very general and can contain either
+         a simple text value or a list of content parts (each of which
+         can either be text or an image). Solvers (e.g. for prompt
+         engineering) often need to interact with chat messages with
+         the assumption that they are a simple string. The text
+         property returns either the plain str content, or if the
+         content is a list of text and images, the text items
+         concatenated together (separated by newline).
+         """
+         if isinstance(self.content, str):
+             return self.content
+         else:
+             all_text = [content.text for content in self.content if content.type == 'text']
+             return '\n'.join(all_text)
+
+     @text.setter
+     def text(self, text: str) -> None:
+         """Set the primary text content for this message.
+
+         ChatMessage content is very general and can contain either
+         a simple text value or a list of content parts (each of which
+         can either be text or an image). Solvers (e.g. for prompt
+         engineering) often need to interact with chat messages with
+         the assumption that they are a simple string. The text property
+         sets text either to the content directly (if it is a `str`) or
+         replaces any text content items with a single new text item,
+         leaving non-text content (e.g. images) in place.
+         """
+         if isinstance(self.content, str):
+             self.content = text
+         else:
+             all_other = [content for content in self.content if content.type != 'text']
+             self.content = all_other + [ContentText(text=text)]
+
+
+ class ChatMessageSystem(ChatMessageBase):
+     """System chat message."""
+
+     role: Literal['system'] = Field(default='system')
+     """Conversation role."""
+
+
+ class ChatMessageUser(ChatMessageBase):
+     """User chat message."""
+
+     role: Literal['user'] = Field(default='user')
+     """Conversation role."""
+
+     tool_call_id: Optional[List[str]] = Field(default=None)
+     """ID(s) of tool call(s) this message has the content payload for."""
+
+
+ class ChatMessageAssistant(ChatMessageBase):
+     """Assistant chat message."""
+
+     role: Literal['assistant'] = Field(default='assistant')
+     """Conversation role."""
+
+     tool_calls: Optional[List[ToolCall]] = Field(default=None)
+     """Tool calls made by the model."""
+
+     model: Optional[str] = Field(default=None)
+     """Model used to generate assistant message."""
+
+     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
+     # content, however since this field doesn't exist in the OpenAI interface,
+     # hosting providers (so far we've seen this with Together and Groq) may
+     # include the reasoning in a <think></think> tag before the main response.
+     # We expect this pattern to be repeated elsewhere, so include this hook to
+     # automatically extract the reasoning content when the response is prefaced
+     # with a <think> block. If this ends up being an overreach we can fall back
+     # to each provider manually parsing out <think> using a helper function.
+     # The implementation isn't important here; the critical thing to establish
+     # is that EvalScope makes reasoning content available separately.
+     @model_validator(mode='before')
+     @classmethod
+     def extract_reasoning(cls, data: Any) -> Any:
+         if isinstance(data, dict):
+             # cleave apart <think> blocks
+             content = data.get('content', None)
+             if isinstance(content, str):
+                 content_text, content_reasoning = parse_content_with_reasoning(content)
+                 if content_reasoning:
+                     data['content'] = [
+                         content_reasoning,
+                         ContentText(text=content_text),
+                     ]
+             # migrate messages that have an explicit 'reasoning' field
+             # (which was our original representation of reasoning)
+             reasoning = data.get('reasoning', None)
+             if isinstance(reasoning, str):
+                 # ensure that content is a list
+                 content = data.get('content', None)
+                 if content is None:
+                     data['content'] = []
+                 elif isinstance(content, str):
+                     data['content'] = [ContentText(text=content)]
+                 elif not isinstance(content, list):
+                     data['content'] = []
+                 data['content'].insert(0, ContentReasoning(reasoning=reasoning))
+
+                 del data['reasoning']
+         return data
+
+
+ class ChatMessageTool(ChatMessageBase):
+     """Tool chat message."""
+
+     role: Literal['tool'] = Field(default='tool')
+     """Conversation role."""
+
+     tool_call_id: Optional[str] = Field(default=None)
+     """ID of tool call."""
+
+     function: Optional[str] = Field(default=None)
+     """Name of function called."""
+
+     error: Optional[ToolCallError] = Field(default=None)
+     """Error which occurred during tool call."""
+
+
+ ChatMessage = Union[ChatMessageSystem, ChatMessageUser, ChatMessageAssistant, ChatMessageTool]
+ """Message in a chat conversation."""
+
+
+ def dict_to_chat_message(data: Dict[str, Any]) -> ChatMessage:
+     """Convert a dictionary to a ChatMessage."""
+
+     if isinstance(data, ChatMessage):
+         return data
+
+     if 'role' not in data:
+         raise ValueError('ChatMessage must have a "role" field')
+
+     role = data['role']
+     if role == 'system':
+         return ChatMessageSystem.model_validate(data)
+     elif role == 'user':
+         return ChatMessageUser.model_validate(data)
+     elif role == 'assistant':
+         return ChatMessageAssistant.model_validate(data)
+     elif role == 'tool':
+         return ChatMessageTool.model_validate(data)
+     else:
+         raise ValueError(f'Unknown chat message role: {role}')
+
+
+ def messages_pretty_str(messages: List[ChatMessage]) -> str:
+     """Pretty print a list of chat messages."""
+     output = []
+     for message in messages:
+         role = message.role.capitalize()
+         content = message.text
+         if isinstance(message, ChatMessageTool):
+             if message.error:
+                 content += f'\nError: {message.error.message}'
+             if message.function:
+                 content += f'\nFunction: {message.function}'
+         output.append(f'**{role}**: {content}')
+     return '\n\n'.join(output)
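A sketch (not part of the diff) of the dictionary round-trip these helpers support, illustrating the <think> extraction performed by the `extract_reasoning` validator above:

msg = dict_to_chat_message({
    'role': 'assistant',
    'content': '<think>Check the atlas.</think>Paris',
})
# The validator splits content into [ContentReasoning, ContentText('Paris')].
print(msg.text)  # 'Paris'
print(messages_pretty_str([ChatMessageUser(content='Capital of France?'), msg]))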
evalscope/api/messages/content.py
@@ -0,0 +1,102 @@
+ from pydantic import BaseModel, Field, JsonValue
+ from typing import Dict, Literal, Optional, Sequence, Union
+
+
+ class ContentBase(BaseModel):
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+
+ class ContentText(ContentBase):
+     """Text content."""
+
+     type: Literal['text'] = Field(default='text')
+     """Type."""
+
+     text: str
+     """Text content."""
+
+     refusal: Optional[bool] = Field(default=None)
+     """Was this a refusal message?"""
+
+
+ class ContentReasoning(ContentBase):
+     """Reasoning content.
+
+     See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+     """  # noqa: E501
+
+     type: Literal['reasoning'] = Field(default='reasoning')
+     """Type."""
+
+     reasoning: str
+     """Reasoning content."""
+
+     signature: Optional[str] = Field(default=None)
+     """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)."""  # noqa: E501
+
+     redacted: bool = Field(default=False)
+     """Indicates that the explicit content of this reasoning block has been redacted."""
+
+
+ class ContentImage(ContentBase):
+     """Image content."""
+
+     type: Literal['image'] = Field(default='image')
+     """Type."""
+
+     image: str
+     """Either a URL of the image or the base64 encoded image data."""
+
+     detail: Literal['auto', 'low', 'high'] = Field(default='auto')
+     """Specifies the detail level of the image.
+
+     Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
+     """  # noqa: E501
+
+
+ class ContentAudio(ContentBase):
+     """Audio content."""
+
+     type: Literal['audio'] = Field(default='audio')
+     """Type."""
+
+     audio: str
+     """Audio file path or base64 encoded data URL."""
+
+     format: Literal['wav', 'mp3']
+     """Format of audio data ('mp3' or 'wav')."""
+
+
+ class ContentVideo(ContentBase):
+     """Video content."""
+
+     type: Literal['video'] = Field(default='video')
+     """Type."""
+
+     video: str
+     """Video file path or base64 encoded data URL."""
+
+     format: Literal['mp4', 'mpeg', 'mov']
+     """Format of video data ('mp4', 'mpeg', or 'mov')."""
+
+
+ class ContentData(ContentBase):
+     """Model internal."""
+
+     type: Literal['data'] = Field(default='data')
+     """Type."""
+
+     data: Dict[str, JsonValue]
+     """Model provider specific payload - required for internal content."""
+
+
+ Content = Union[
+     ContentText,
+     ContentReasoning,
+     ContentImage,
+     ContentAudio,
+     ContentVideo,
+     ContentData,
+ ]
+ """Content sent to or received from a model."""
evalscope/api/messages/utils.py
@@ -0,0 +1,35 @@
+ import re
+ from typing import Optional
+
+ from .content import ContentReasoning
+
+
+ def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
+     """
+     Looks for and extracts <think/> tags into reasoning text.
+
+     Returns a tuple:
+     - The first element is the input content with the <think> tag and its contents fully removed.
+     - The second element is a ContentReasoning object (or None if no <think> tag is found).
+     """
+     # Match <think> tag with optional attributes anywhere in the string
+     pattern = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
+     match = re.search(pattern, content, re.DOTALL)
+
+     if match:
+         signature = match.group(1)  # This will be None if not present
+         redacted_value = match.group(2)  # This will be "true" or None
+         reasoning = match.group(3).strip()
+         # Remove the matched <think>...</think> from the input
+         start, end = match.span()
+
+         return (
+             (content[:start] + content[end:]).strip(),
+             ContentReasoning(
+                 reasoning=reasoning,
+                 signature=signature,
+                 redacted=redacted_value == 'true',
+             ),
+         )
+     else:
+         return content, None
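A worked example of the parser (not part of the diff): the tag and its attributes are stripped from the text and returned separately as a ContentReasoning object.

text, reasoning = parse_content_with_reasoning(
    '<think signature="abc123">Consider the options.</think>The answer is B.'
)
print(text)                 # 'The answer is B.'
print(reasoning.reasoning)  # 'Consider the options.'
print(reasoning.signature)  # 'abc123'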
evalscope/api/metric/__init__.py
@@ -0,0 +1,2 @@
+ from .metric import Metric, T2IMetric
+ from .scorer import Aggregator, AggScore, SampleScore, Score, Value