evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/dataset/utils.py (new file)
@@ -0,0 +1,143 @@
+ import json
+ from tqdm import tqdm
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast
+
+ from .dataset import Dataset, FieldSpec, Sample
+
+
+ def record_to_sample_fn(sample_fields: Union[FieldSpec, Callable, None] = None, ) -> Callable:
+     if sample_fields is None:
+         sample_fields = FieldSpec()
+
+     if isinstance(sample_fields, FieldSpec):
+
+         def record_to_sample(record: dict) -> Sample:
+             # collect metadata if specified
+             metadata: Optional[Dict[str, Any]] = None
+             if sample_fields.metadata:
+                 if isinstance(sample_fields.metadata, list):
+                     metadata = {}
+                     for name in sample_fields.metadata:
+                         metadata[name] = record.get(name)
+
+             elif 'metadata' in record:
+                 metadata_field = record.get('metadata')
+                 if isinstance(metadata_field, str):
+                     metadata = json.loads(metadata_field)
+                 elif isinstance(metadata_field, dict):
+                     metadata = metadata_field
+                 else:
+                     raise ValueError(f"Unexpected type for 'metadata' field: {type(metadata_field)}")
+
+             # return sample
+             return Sample(
+                 input=read_input(record.get(sample_fields.input)),
+                 target=read_target(record.get(sample_fields.target)),
+                 choices=read_choices(record.get(sample_fields.choices)),
+                 id=record.get(sample_fields.id, None),
+                 metadata=metadata,
+                 sandbox=read_sandbox(record.get(sample_fields.sandbox)),
+                 files=read_files(record.get(sample_fields.files)),
+                 setup=read_setup(record.get(sample_fields.setup)),
+             )
+
+         return record_to_sample
+
+     else:
+         return sample_fields
+
+
+ def data_to_samples(data: Iterable[dict], data_to_sample: Callable) -> List[Sample]:
+     samples: List[Sample] = []
+     for record in tqdm(data, desc='Processing records'):
+         record_samples = as_sample_list(data_to_sample(record=record))
+         samples.extend(record_samples)
+     return samples
+
+
+ def as_sample_list(samples: Union[Sample, List[Sample]]) -> List[Sample]:
+     if isinstance(samples, list):
+         return samples
+     else:
+         return [samples]
+
+
+ def read_input(input_val: Optional[Any]) -> str:
+     if not input_val:
+         raise ValueError('No input in dataset')
+     return str(input_val)
+
+
+ def read_target(obj: Optional[Any]) -> Union[str, List[str]]:
+     if obj is not None:
+         return [str(item) for item in obj] if isinstance(obj, list) else str(obj)
+     else:
+         return ''
+
+
+ def read_choices(obj: Optional[Any]) -> Optional[List[str]]:
+     if obj is not None:
+         if isinstance(obj, list):
+             return [str(choice) for choice in obj]
+         elif isinstance(obj, str):
+             choices = obj.split(',')
+             if len(choices) == 1:
+                 choices = obj.split()
+             return [choice.strip() for choice in choices]
+         else:
+             return [str(obj)]
+     else:
+         return None
+
+
+ def read_setup(setup: Optional[Any]) -> Optional[str]:
+     if setup is not None:
+         return str(setup)
+     else:
+         return None
+
+
+ def read_sandbox(sandbox: Optional[Any]) -> Optional[str]:
+     if sandbox is not None:
+         if isinstance(sandbox, str):
+             return sandbox
+         elif isinstance(sandbox, dict):
+             return json.dumps(sandbox)
+         else:
+             raise ValueError(f"Unexpected type for 'sandbox' field: {type(sandbox)}")
+     else:
+         return None
+
+
+ def read_files(files: Optional[Any]) -> Optional[Dict[str, str]]:
+     if files is not None:
+         if isinstance(files, str):
+             files = json.loads(files)
+         if isinstance(files, dict):
+             if all(isinstance(v, str) for v in files.values()):
+                 return cast(Dict[str, str], files)
+
+         # didn't find the right type
+         raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
+     else:
+         return None
+
+
+ def shuffle_choices_if_requested(dataset: Dataset, shuffle_choices: Optional[Union[bool, int]]) -> None:
+     """
+     Shuffle the choices in the dataset if requested.
+
+     The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
+     and `hf_dataset` can be a boolean, an integer, or `None` (default).
+     If it is a boolean, it will shuffle the choices if the value is `True`,
+     and do nothing if it is `False`.
+     If it is an integer, it will shuffle the choices using the integer as the seed.
+     """
+     # Note that `isinstance(x, int)` returns True if x is True or False,
+     # so we need to check for both explicitly
+     if shuffle_choices is True:
+         dataset.shuffle_choices()
+     elif shuffle_choices is False:
+         pass
+     elif isinstance(shuffle_choices, int):
+         dataset.shuffle_choices(seed=shuffle_choices)
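The loader helpers above reduce dataset ingestion to supplying an iterable of dicts plus either a FieldSpec or a custom record-to-sample callable. The sketch below shows how they might be combined; the FieldSpec keyword names (input, target, choices) and the import location of FieldSpec are assumptions inferred from the attribute accesses and the `from .dataset import ...` line above, not taken from evalscope's documentation.

# Hypothetical usage sketch for the helpers in evalscope/api/dataset/utils.py.
from evalscope.api.dataset.dataset import FieldSpec  # assumed location, per `from .dataset import ...` above
from evalscope.api.dataset.utils import data_to_samples, record_to_sample_fn

raw_records = [
    {'question': 'What is 2 + 2?', 'answer': '4', 'options': '3, 4, 5'},
    {'question': 'Capital of France?', 'answer': 'Paris', 'options': 'Paris, London, Rome'},
]

# Map record keys onto Sample fields (keyword names assumed from the sample_fields.* accesses above).
to_sample = record_to_sample_fn(FieldSpec(input='question', target='answer', choices='options'))

# data_to_samples iterates the records with a tqdm progress bar and returns a List[Sample];
# read_choices splits the comma-separated 'options' string into a list of choices.
samples = data_to_samples(raw_records, to_sample)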
evalscope/api/evaluator/__init__.py (new file)
@@ -0,0 +1,3 @@
+ from .cache import CacheManager, ModelResult, ReviewResult
+ from .evaluator import Evaluator
+ from .state import Choices, Target, TaskState
evalscope/api/evaluator/cache.py (new file)
@@ -0,0 +1,355 @@
+ import copy
+ import os
+ from pydantic import BaseModel
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from evalscope.api.dataset import Dataset
+ from evalscope.api.messages import ChatMessage
+ from evalscope.api.metric import SampleScore
+ from evalscope.api.model import ModelOutput
+ from evalscope.constants import DumpMode
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .state import TaskState
+
+ logger = get_logger()
+
+
+ class CacheManager:
+     """
+     Manage model results and review results for evaluation caching.
+
+     This class handles the caching mechanism for evaluation results, allowing
+     the system to resume evaluations from previously computed results and
+     avoid redundant computations.
+     """
+
+     def __init__(self, outputs: OutputsStructure, model_name: str, benchmark_name: str):
+         """
+         Initialize the cache manager.
+
+         Args:
+             outputs: Output directory structure for storing cache files
+             model_name: Name of the model being evaluated
+             benchmark_name: Name of the benchmark being used
+         """
+         self.outputs = outputs
+         self.model_name = model_name
+         self.benchmark_name = benchmark_name
+
+     def filter_prediction_cache(self, subset: str, dataset: Dataset) -> Tuple[List[TaskState], Dataset]:
+         """
+         Load cached prediction results and filter them from the dataset.
+
+         This method checks for existing prediction cache files and loads any
+         previously computed results. It then filters these samples from the
+         input dataset to avoid recomputation.
+
+         Args:
+             subset: Name of the dataset subset
+             dataset: The dataset to filter
+
+         Returns:
+             Tuple of (cached task states, filtered dataset with remaining samples)
+         """
+         cache_file = self.get_prediction_cache_path(subset)
+         if not os.path.exists(cache_file):
+             # No cache file exists, return empty cache and full dataset
+             return [], dataset
+
+         cached_task_states = []
+         cached_sample_ids = set()
+         cache_items = jsonl_to_list(cache_file)
+
+         # Process each cached item
+         for cache_item in cache_items:
+             # Deserialize the cached model result
+             cached_model_result = ModelResult.model_validate(cache_item)
+             # Convert to task state for further processing
+             cached_state = cached_model_result.to_task_state(dataset=dataset)
+
+             cached_task_states.append(cached_state)
+             cached_sample_ids.add(cached_state.sample_id)
+
+         # Remove cached samples from the dataset to avoid reprocessing
+         filtered_dataset = dataset.filter(lambda sample: sample.id not in cached_sample_ids)
+
+         logger.info(
+             f'Reusing predictions from {cache_file}, got {len(cached_task_states)} predictions, '
+             f'remaining {len(filtered_dataset)} samples'
+         )
+         return cached_task_states, filtered_dataset
+
+     def get_prediction_cache_path(self, subset: str) -> str:
+         """
+         Get the file path for prediction cache storage.
+
+         Args:
+             subset: Name of the dataset subset
+
+         Returns:
+             Path to the prediction cache file
+         """
+         file_path = os.path.join(self.outputs.predictions_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         return file_path
+
+     def save_prediction_cache(self, subset: str, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
+         """
+         Save a prediction result to the cache.
+
+         Args:
+             subset: Name of the dataset subset
+             task_state: The task state containing prediction results
+
+         Returns:
+             The saved model result object
+         """
+         cache_file = self.get_prediction_cache_path(subset)
+         # Convert task state to serializable model result
+         model_result = ModelResult.from_task_state(task_state, save_metadata)
+         # Serialize to dictionary
+         model_result_dict = model_result.model_dump()
+         # Append to JSONL cache file
+         dump_jsonl_data(data_list=model_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
+         return model_result
+
+     def filter_review_cache(self, subset: str,
+                             task_states: List[TaskState]) -> Tuple[List[SampleScore], List[TaskState]]:
+         """
+         Load cached review results and filter corresponding task states.
+
+         This method loads previously computed review scores and removes
+         the corresponding task states from further review processing.
+
+         Args:
+             subset: Name of the dataset subset
+             task_states: List of task states to potentially review
+
+         Returns:
+             Tuple of (cached sample scores, filtered task states for remaining reviews)
+         """
+         cache_file = self.get_review_cache_path(subset)
+         if not os.path.exists(cache_file):
+             # No review cache exists, return empty scores and all task states
+             return [], task_states
+
+         cached_sample_scores: List[SampleScore] = []
+         cache_items = jsonl_to_list(cache_file)
+
+         # Process each cached review result
+         for cache_item in cache_items:
+             # Deserialize the cached review result
+             cached_review_result = ReviewResult.model_validate(cache_item)
+             cached_sample_scores.append(cached_review_result.to_sample_score())
+
+         # Filter out task states that already have review scores
+         cached_sample_ids = {review.sample_id for review in cached_sample_scores}
+         filtered_task_states = [state for state in task_states if state.sample_id not in cached_sample_ids]
+
+         logger.info(f'Reusing reviews from {cache_file}, got {len(cached_sample_scores)} reviews')
+         return cached_sample_scores, filtered_task_states
+
+     def get_review_cache_path(self, subset: str) -> str:
+         """
+         Get the file path for review cache storage.
+
+         Args:
+             subset: Name of the dataset subset
+
+         Returns:
+             Path to the review cache file
+         """
+         file_path = os.path.join(self.outputs.reviews_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         return file_path
+
+     def delete_review_cache(self, subset: str):
+         """Delete the review cache for a specific subset. If the cache exists, it will be removed."""
+         file_path = self.get_review_cache_path(subset)
+         if os.path.exists(file_path):
+             logger.info(f'Deleting review cache file: {file_path}')
+             os.remove(file_path)
+
+     def save_review_cache(
+         self,
+         subset: str,
+         task_state: TaskState,
+         sample_score: SampleScore,
+         save_metadata: bool = True
+     ) -> 'ReviewResult':
+         """
+         Save a review result to the cache.
+
+         Args:
+             subset: Name of the dataset subset
+             task_state: The task state that was reviewed
+             sample_score: The computed score for the sample
+
+         Returns:
+             The saved review result object
+         """
+         cache_file = self.get_review_cache_path(subset)
+         # Convert score and state to serializable review result
+         review_result = ReviewResult.from_score_state(sample_score, task_state, save_metadata)
+         # Serialize to dictionary
+         review_result_dict = review_result.model_dump()
+         # Append to JSONL cache file
+         dump_jsonl_data(data_list=review_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
+         return review_result
+
+     def get_report_path(self) -> str:
+         """
+         Get the directory path for report storage.
+
+         Returns:
+             Path to the reports directory for this model
+         """
+         report_path = os.path.join(self.outputs.reports_dir, self.model_name)
+         # Ensure the directory exists
+         if self.outputs.is_make:
+             os.makedirs(report_path, exist_ok=True)
+         return report_path
+
+     def get_report_file(self) -> str:
+         """
+         Get the report file path for the benchmark.
+
+         The report file is named as '{benchmark_name}.json' and contains
+         the final evaluation results for the benchmark.
+
+         Returns:
+             Full path to the benchmark report file
+         """
+         return os.path.join(self.get_report_path(), f'{self.benchmark_name}.json')
+
+
+ class ModelResult(BaseModel):
+     """
+     Serializable container for model prediction results.
+
+     This class represents a single model prediction that can be cached
+     and restored later to avoid recomputation.
+     """
+
+     index: int
+     """Index of the sample in the dataset that was processed."""
+
+     model: str = ''
+     """Name of the model that generated this prediction."""
+
+     model_output: Optional[ModelOutput] = None
+     """The actual prediction/output generated by the model."""
+
+     messages: List[ChatMessage] = []
+     """Chat messages exchanged during evaluation (for conversational models)."""
+
+     metadata: Optional[Dict[str, Any]] = None
+     """Additional metadata associated with the model result."""
+
+     @classmethod
+     def from_task_state(cls, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
+         """
+         Create a ModelResult from a TaskState for caching.
+
+         Args:
+             task_state: The completed task state to serialize
+
+         Returns:
+             ModelResult object ready for caching
+         """
+         return cls(
+             model=task_state.model,
+             index=task_state.sample_id,
+             messages=task_state.messages,
+             model_output=task_state.output,
+             metadata=task_state.metadata if save_metadata else {},
+         )
+
+     def to_task_state(self, dataset: Dataset) -> TaskState:
+         """
+         Restore a TaskState from cached ModelResult.
+
+         Args:
+             dataset: The dataset to retrieve the original sample from
+
+         Returns:
+             Reconstructed TaskState with cached results
+
+         Raises:
+             ValueError: If the sample index is not found in the dataset
+         """
+         sample = dataset[self.index]
+         if not sample:
+             raise ValueError(f'Sample with index {self.index} not found in dataset')
+
+         # update metadata if exists
+         if self.metadata:
+             sample.metadata.update(self.metadata)
+
+         return TaskState(
+             model=self.model,
+             sample=sample,
+             messages=self.messages,
+             output=ModelOutput.model_validate(self.model_output),
+             completed=True,  # Mark as completed since it was cached
+         )
+
+
+ class ReviewResult(BaseModel):
+     """
+     Serializable container for review/scoring results.
+
+     This class represents the result of reviewing a model's prediction,
+     including the computed score and relevant context.
+     """
+
+     index: int
+     """Index of the sample that was reviewed."""
+
+     input: str = ''
+     """Original input from the sample (immutable reference)."""
+
+     target: Optional[str] = None
+     """Expected/target answer for the sample, if available."""
+
+     sample_score: SampleScore
+     """The computed evaluation score for this sample."""
+
+     @classmethod
+     def from_score_state(
+         cls, sample_score: SampleScore, state: TaskState, save_metadata: bool = True
+     ) -> 'ReviewResult':
+         """
+         Create a ReviewResult from a score and task state for caching.
+
+         Args:
+             sample_score: The computed score for the sample
+             state: The task state containing sample information
+
+         Returns:
+             ReviewResult object ready for caching
+         """
+         if not save_metadata:
+             sample_score = copy.deepcopy(sample_score)
+             sample_score.sample_metadata = None
+
+         return cls(
+             index=state.sample_id,
+             input=state.input_text,
+             target=state.target,
+             sample_score=sample_score,
+         )
+
+     def to_sample_score(self) -> SampleScore:
+         """
+         Extract the sample score from the cached review result.
+
+         Returns:
+             The sample score object
+         """
+         return self.sample_score
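CacheManager implements resumability by pairing each filter_* call (load what is already on disk and shrink the work set) with a save_* call (append the new result to the subset's JSONL file). The sketch below shows the prediction side of that loop; iterating the filtered Dataset sample by sample and the caller-supplied generate callable are illustrative assumptions, not evalscope APIs.

# Illustrative resume-aware prediction loop built on CacheManager (sketch only).
from typing import Callable, List

from evalscope.api.dataset import Dataset
from evalscope.api.evaluator import CacheManager, TaskState


def predict_with_cache(
    cache: CacheManager,
    subset: str,
    dataset: Dataset,
    generate: Callable[..., TaskState],
) -> List[TaskState]:
    # Load previously cached predictions and drop those samples from the dataset.
    cached_states, remaining = cache.filter_prediction_cache(subset, dataset)

    new_states: List[TaskState] = []
    for sample in remaining:  # assumes Dataset is iterable over its samples
        state = generate(sample)  # caller-supplied: run the model and build a TaskState
        cache.save_prediction_cache(subset, state)  # appended to the subset's JSONL cache
        new_states.append(state)

    # Cached and freshly computed TaskStates feed the same downstream review step.
    return cached_states + new_states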
evalscope/api/evaluator/evaluator.py (new file)
@@ -0,0 +1,56 @@
+ import abc
+ from typing import TYPE_CHECKING, List, Union
+
+ from evalscope.api.metric import SampleScore
+ from evalscope.report import Report
+ from .state import TaskState
+
+ if TYPE_CHECKING:
+     from evalscope.api.benchmark import DataAdapter
+     from evalscope.api.model import Model
+     from evalscope.config import TaskConfig
+     from evalscope.utils.io_utils import OutputsStructure
+
+
+ class Evaluator(abc.ABC):
+     """
+     Abstract base class for evaluators.
+
+     Args:
+         benchmark (DataAdapter): The data adapter for the benchmark.
+         model (Model): The model to evaluate.
+         outputs (OutputsStructure, optional): The output structure for results.
+         task_config (TaskConfig, optional): The task configuration.
+     """
+
+     def __init__(
+         self,
+         benchmark: 'DataAdapter',
+         model: 'Model',
+         outputs: 'OutputsStructure' = None,
+         task_config: 'TaskConfig' = None,
+     ):
+         self.benchmark = benchmark
+         self.model = model
+         self.outputs = outputs
+         self.task_config = task_config
+
+     @abc.abstractmethod
+     def eval(self, *args, **kwargs) -> Report:
+         """Run the evaluation process."""
+         pass
+
+     @abc.abstractmethod
+     def get_answers(self, *args, **kwargs) -> List[TaskState]:
+         """Get the evaluation answers."""
+         pass
+
+     @abc.abstractmethod
+     def get_reviews(self, *args, **kwargs) -> List[SampleScore]:
+         """Get the review results."""
+         pass
+
+     @abc.abstractmethod
+     def get_report(self, *args, **kwargs) -> Report:
+         """Get the evaluation report."""
+         pass
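The abstract interface above fixes the shape of an evaluation run: produce TaskStates, score them, and fold the scores into a Report. The skeleton below only illustrates which methods a concrete evaluator has to provide; the bodies are placeholders and the answers -> reviews -> report ordering is inferred from the method names, not copied from evalscope's own implementation.

# Skeleton subclass illustrating the Evaluator contract (placeholder bodies only).
from typing import List

from evalscope.api.evaluator import Evaluator, TaskState
from evalscope.api.metric import SampleScore
from evalscope.report import Report


class MyEvaluator(Evaluator):

    def eval(self, *args, **kwargs) -> Report:
        # Typical flow implied by the interface: answers -> reviews -> report.
        task_states = self.get_answers()
        sample_scores = self.get_reviews(task_states)
        return self.get_report(sample_scores)

    def get_answers(self, *args, **kwargs) -> List[TaskState]:
        # Would run self.model over the samples provided by self.benchmark.
        return []

    def get_reviews(self, *args, **kwargs) -> List[SampleScore]:
        # Would score each TaskState against its target (possibly via an LLM judge).
        return []

    def get_report(self, *args, **kwargs) -> Report:
        # Would aggregate SampleScores into a Report; construction details are not shown in this diff.
        raise NotImplementedError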