evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/dataset/dataset.py (new file)
@@ -0,0 +1,349 @@
+ import abc
+ import random
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from pydantic import BaseModel, Field
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
+
+ from evalscope.api.messages import ChatMessage, messages_pretty_str
+ from evalscope.api.tool import ToolInfo
+ from evalscope.utils.multi_choices import answer_character, answer_index
+
+
+ class Sample(BaseModel):
+     r"""Sample for an evaluation task."""
+
+     input: Union[str, List[ChatMessage]]
+     """The input to be submitted to the model."""
+
+     choices: Optional[List[str]] = None
+     """List of available answer choices (used only for multiple-choice evals)."""
+
+     target: Union[str, List[str]] = ''
+     """Ideal target output. May be a literal value or narrative text to be used by a model grader."""
+
+     id: Optional[int] = None
+     """Unique identifier for sample."""
+
+     group_id: Optional[int] = None
+     """Identifier for the group this sample belongs to, used for grouping k repeated samples."""
+
+     tools: Optional[List[ToolInfo]] = None
+     """List of tools available to the model during inference (optional)."""
+
+     category: Optional[str] = None
+     """Category of the sample (optional)."""
+
+     subset_key: Optional[str] = None
+     """Key for the subset this sample belongs to, used for generating subsets (optional)."""
+
+     metadata: Dict[str, Any] = Field(default_factory=dict)
+     """Arbitrary metadata associated with the sample."""
+
+     sandbox: Optional[str] = None
+     """Sandbox environment type and optional config file."""
+
+     files: Optional[Dict[str, str]] = None
+     """Files that go along with the sample (copied to SandboxEnvironment)"""
+
+     setup: Optional[str] = None
+     """Setup script to run for sample (run within default SandboxEnvironment)."""
+
+     def pretty_print(self) -> str:
+         """Return a pretty-printed string representation of the sample."""
+         if isinstance(self.input, str):
+             input_text = self.input
+         else:
+             input_text = messages_pretty_str(self.input)
+         return f'Sample ID: {self.id}\nInput: {input_text}\nTarget: {self.target}'
+
+
+ @dataclass
+ class FieldSpec:
+     r"""Specification for mapping data source fields to sample fields."""
+
+     input: str = field(default='input')
+     """Name of the field containing the sample input."""
+
+     target: str = field(default='target')
+     """Name of the field containing the sample target."""
+
+     choices: str = field(default='choices')
+     """Name of field containing the list of answer choices."""
+
+     id: int = field(default=0)
+     """ Unique identifier for the sample."""
+
+     metadata: Optional[List[str]] = field(default=None)
+     """List of additional field names that should be read as metadata."""
+
+     sandbox: str = field(default='sandbox')
+     """Sandbox type along with optional config file."""
+
+     files: str = field(default='files')
+     """Files that go along with the sample."""
+
+     setup: str = field(default='setup')
+     """Setup script to run for sample (run within default SandboxEnvironment)."""
+
+
+ class Dataset(Sequence[Sample], abc.ABC):
+     r"""A sequence of Sample objects.
+
+     Datasets provide sequential access (via conventional indexes or slicing)
+     to a collection of Sample objects.
+     """
+
+     @property
+     @abc.abstractmethod
+     def name(self) -> Optional[str]:
+         ...
+
+     @property
+     @abc.abstractmethod
+     def location(self) -> Optional[str]:
+         ...
+
+     @property
+     @abc.abstractmethod
+     def shuffled(self) -> bool:
+         ...
+
+     @abc.abstractmethod
+     def __iter__(self) -> Iterator[Sample]:
+         """Return an iterator over the samples."""
+         ...
+
+     @abc.abstractmethod
+     def __getitem__(self, index: Union[int, slice]) -> Union[Sample, 'Dataset']:
+         ...
+
+     @abc.abstractmethod
+     def __len__(self) -> int:
+         ...
+
+     @abc.abstractmethod
+     def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'Dataset':
+         """Filter the dataset using a predicate. Only samples matching the predicate will be included.
+
+         Args:
+             predicate: Filtering function.
+             name: Name for filtered dataset (optional).
+
+         Returns:
+             Filtered dataset.
+         """
+         ...
+
+     @abc.abstractmethod
+     def shuffle(self, seed: Optional[int] = None) -> None:
+         """Shuffle the order of the dataset (in place).
+
+         Args:
+             seed: Random seed for shuffling (optional).
+         """
+         ...
+
+     @abc.abstractmethod
+     def shuffle_choices(self, seed: Optional[int] = None) -> None:
+         """Shuffle the order of the choices with each sample.
+
+         Args:
+             seed: Random seed for shuffling (optional).
+         """
+         ...
+
+     @abc.abstractmethod
+     def reindex(self, group_size=1):
+         """Reindex the dataset samples to ensure consistent ordering.
+
+         Args:
+             group_size: Number of samples per group for setting group_id.
+         """
+         ...
+
+
+ class MemoryDataset(Dataset):
+     r"""A Dataset stored in memory."""
+
+     def __init__(
+         self,
+         samples: List[Sample],
+         name: Optional[str] = None,
+         location: Optional[str] = None,
+         shuffled: bool = False,
+     ) -> None:
+         r"""A dataset of samples held in an in-memory list.
+
+         Datasets provide sequential access (via conventional indexes or slicing)
+         to a collection of Sample objects. The ListDataset is explicitly
+         initialized with a list that is held in memory.
+
+         Args:
+             samples (List[Sample]): The list of sample objects.
+             name (str | None): Optional name for dataset.
+             location (str | None): Optional location for dataset.
+             shuffled (bool): Was the dataset shuffled after reading.
+         """
+         self.samples = samples
+         self._name = name
+         self._location = location
+         self._shuffled = shuffled
+
+     @property
+     def name(self) -> Optional[str]:
+         """Dataset name."""
+         return self._name
+
+     @property
+     def location(self) -> Optional[str]:
+         """Dataset location."""
+         return self._location
+
+     @property
+     def shuffled(self) -> bool:
+         """Was the dataset shuffled."""
+         return self._shuffled
+
+     def __iter__(self) -> Iterator[Sample]:
+         return iter(self.samples)
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[Sample, Dataset]:
+         if isinstance(index, int):
+             return self.samples[index]
+         else:
+             return MemoryDataset(
+                 samples=self.samples[index],
+                 name=self.name,
+                 location=self.location,
+                 shuffled=self.shuffled,
+             )
+
+     def __len__(self) -> int:
+         return len(self.samples)
+
+     def shuffle(self, seed: Optional[int] = None) -> None:
+         if seed is not None:
+             random.Random(seed).shuffle(self.samples)
+         else:
+             random.shuffle(self.samples)
+         self._shuffled = True
+
+     def shuffle_choices(self, seed: Optional[int] = None) -> None:
+         rand = random.Random(seed)
+         for sample in self.samples:
+             if not sample.choices:
+                 continue
+             # The original positions
+             positions = list(range(len(sample.choices)))
+
+             # Shuffle the choices
+             rand.shuffle(positions)
+             shuffled_choices = [sample.choices[i] for i in positions]
+
+             # Map of original position / target letter
+             position_map = {i: answer_character(new_i) for new_i, i in enumerate(positions)}
+
+             # Update to the shuffled choices and target
+             sample.choices = shuffled_choices
+             sample.target = self._remap_target(sample.target, position_map=position_map)
+
+     def _remap_target(self, target: Union[str, List[str]], position_map: Dict[int, str]) -> Union[str, List[str]]:
+         if isinstance(target, list):
+             return [position_map[answer_index(t)] for t in target]
+         else:
+             return position_map[answer_index(target)]
+
+     def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'MemoryDataset':
+         return MemoryDataset(
+             name=name or self.name,
+             location=self.location,
+             samples=[sample for sample in self.samples if predicate(sample)],
+             shuffled=self.shuffled,
+         )
+
+     def reindex(self, group_size=1):
+         # Reindex the dataset samples to ensure consistent ordering
+         for i, sample in enumerate(self.samples):
+             sample.id = i
+             sample.group_id = i // group_size
+
+
+ class DatasetDict:
+     """
+     A dictionary-like container for datasets.
+     """
+
+     def __init__(self, datasets: Dict[str, Dataset]):
+         self.datasets = datasets
+
+     def __getitem__(self, key: str) -> Dataset:
+         return self.datasets[key]
+
+     def __setitem__(self, key: str, value: Dataset) -> None:
+         self.datasets[key] = value
+
+     def __delitem__(self, key: str) -> None:
+         del self.datasets[key]
+
+     def get(self, key: str, default: Optional[Dataset] = None) -> Optional[Dataset]:
+         return self.datasets.get(key, default)
+
+     def items(self):
+         return self.datasets.items()
+
+     def keys(self):
+         return self.datasets.keys()
+
+     def values(self):
+         return self.datasets.values()
+
+     def __len__(self) -> int:
+         return len(self.datasets)
+
+     @classmethod
+     def from_dataset(
+         cls,
+         dataset: Dataset,
+         subset_list: List[str],
+         limit: Optional[Union[int, float]] = None,
+         repeats: int = 1
+     ) -> 'DatasetDict':
+         """
+         Create a DatasetDict from a single Dataset using subset key in the sample.
+
+         Args:
+             dataset (Dataset): The dataset to wrap in a DatasetDict.
+             subset_list (List[str]): List of subset keys to include.
+             limit (int | float | None): Optional limit on number of samples per subset.
+                 If int, limits to that many samples. If float, limits to that fraction of samples.
+
+         Returns:
+             DatasetDict: A new DatasetDict containing the provided dataset.
+         """
+         data_dict = defaultdict(list)
+         dataset_dict = defaultdict(list)
+         # init subset keys to prevent order issues
+         for key in subset_list:
+             data_dict[key] = []
+             dataset_dict[key] = []
+
+         # Loop through each sample in the dataset
+         for sample in dataset.samples:
+             subset_key = sample.subset_key or 'default'
+             data_dict[subset_key].append(sample)
+         # Create a MemoryDataset for each subset key
+         for key, samples in data_dict.items():
+             if key not in subset_list:
+                 continue
+             # Apply limit if specified
+             if limit is not None:
+                 if isinstance(limit, float):
+                     limit = int(len(samples) * limit)
+                 total_limit = limit * repeats
+                 samples = samples[:total_limit]
+             cur_dataset = MemoryDataset(samples, name=dataset.name)
+             # Reindex the dataset to ensure consistent IDs and group IDs
+             cur_dataset.reindex(group_size=repeats)
+             dataset_dict[key] = cur_dataset
+         return cls(dataset_dict)
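To make the API in this hunk easier to follow, here is a minimal usage sketch of Sample, MemoryDataset and DatasetDict. It is illustrative only: the two toy samples, their subset_key values and the letter targets are invented, and the import path assumes this hunk is evalscope/api/dataset/dataset.py as inferred from the file list above.

from evalscope.api.dataset.dataset import DatasetDict, MemoryDataset, Sample

# Two invented multiple-choice samples. Targets are answer letters so that
# shuffle_choices() can remap them through answer_index/answer_character.
samples = [
    Sample(input='What is 2 + 2?', choices=['3', '4'], target='B', subset_key='math'),
    Sample(input='Which city is the capital of France?', choices=['Rome', 'Paris'], target='B', subset_key='geo'),
]

dataset = MemoryDataset(samples, name='toy')
dataset.shuffle(seed=42)           # reorder the samples in place
dataset.shuffle_choices(seed=42)   # reshuffle each sample's choices and remap its target letter
dataset.reindex(group_size=1)      # assign sequential id / group_id values

math_only = dataset.filter(lambda s: s.subset_key == 'math', name='toy-math')
print(math_only[0].pretty_print())

# Split the same dataset into per-subset MemoryDatasets keyed by subset_key.
by_subset = DatasetDict.from_dataset(dataset, subset_list=['math', 'geo'])

Note that shuffle_choices assumes targets can be parsed by answer_index, i.e. they are answer letters rather than full answer strings.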
evalscope/api/dataset/loader.py (new file)
@@ -0,0 +1,261 @@
+ import copy
+ import os
+ import random
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Callable, Dict, List, Optional, Union
+
+ from evalscope.api.dataset.utils import record_to_sample_fn
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
+ from evalscope.utils import get_logger
+ from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename
+ from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
+ from .utils import data_to_samples, shuffle_choices_if_requested
+
+ logger = get_logger()
+
+
+ class DataLoader(ABC):
+     """
+     Abstract base class for data loaders.
+     """
+
+     def __init__(
+         self,
+         data_id_or_path: str,
+         split: str,
+         sample_fields: Union[FieldSpec, Callable] = None,
+         filter_func: Callable = None,
+         subset: str = 'default',
+         version: str = None,
+         limit: Union[int, float] = None,
+         data_source: Optional[str] = None,
+         shuffle: bool = False,
+         shuffle_choices: Optional[Union[bool, int]] = None,
+         seed: Optional[int] = None,
+         auto_id: bool = True,
+         repeats: int = 1,
+         trust_remote: bool = True,
+         **kwargs
+     ):
+         self.data_id_or_path = data_id_or_path
+         self.split = split
+         self.sample_fields = sample_fields
+         self.filter_func = filter_func
+         self.subset = subset
+         self.version = version
+         self.limit = limit
+         self.data_source = data_source
+         self.shuffle = shuffle
+         self.shuffle_choices = shuffle_choices
+         self.seed = seed
+         self.auto_id = auto_id
+         self.repeats = repeats
+         self.trust_remote = trust_remote
+         self.kwargs = kwargs
+
+     @abstractmethod
+     def load(self) -> Dataset:
+         """
+         Load data from the source.
+         """
+         ...
+
+
+ class RemoteDataLoader(DataLoader):
+     """
+     Data loader for remote datasets: ModelScope or Huggingface.
+     """
+
+     def load(self) -> Dataset:
+         import datasets
+         from modelscope import MsDataset
+
+         path = self.data_id_or_path
+         # resolve data_to_sample function
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         # generate a unique cache dir for this dataset
+         dataset_hash = gen_hash(f'{path}{self.split}{self.subset}{self.version}{self.kwargs}')
+         datasets_cache_dir = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'datasets')
+         dataset_cache_dir = os.path.join(datasets_cache_dir, f'{safe_filename(path)}-{dataset_hash}')
+         if os.path.exists(dataset_cache_dir):
+             dataset = datasets.load_from_disk(dataset_cache_dir)
+         else:
+             logger.info(
+                 f'Loading dataset {path} from {self.data_source} > subset: {self.subset} > split: {self.split} ...'
+             )
+             if self.data_source == HubType.MODELSCOPE:
+                 dataset = MsDataset.load(
+                     dataset_name=path,
+                     split=self.split,
+                     subset_name=self.subset,
+                     version=self.version,
+                     trust_remote_code=self.trust_remote,
+                     **self.kwargs,
+                 )
+                 # convert to Huggingface dataset if necessary
+                 if not isinstance(dataset, datasets.Dataset):
+                     dataset = dataset.to_hf_dataset()
+             elif self.data_source in [HubType.HUGGINGFACE, HubType.LOCAL]:
+                 # remove dataset_infos.json if it exists, since the datasets library raises an error when it is present.
+                 dataset_infos_path = os.path.join(path, 'dataset_infos.json')
+                 if os.path.exists(dataset_infos_path):
+                     logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid datasets errors.')
+                     os.remove(dataset_infos_path)
+                 # load dataset from Huggingface or local path
+                 dataset = datasets.load_dataset(
+                     path=path,
+                     name=self.subset if self.subset != 'default' else None,
+                     split=self.split,
+                     revision=self.version,
+                     trust_remote_code=self.trust_remote,
+                     **self.kwargs,
+                 )
+
+             # Only save to disk if not loading from local path
+             if self.data_source != HubType.LOCAL:
+                 dataset.save_to_disk(dataset_cache_dir)
+
+         # shuffle if requested
+         if self.shuffle:
+             dataset = dataset.shuffle(seed=self.seed)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             dataset = dataset.select(range(self.limit))
+
+         # convert to list
+         dataset = dataset.to_list()
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(
+             samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
+             name=Path(path).stem if Path(path).exists() else path,
+             location=path,
+         )
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
+
+
+ class LocalDataLoader(DataLoader):
+     """
+     Data loader for local datasets. Reads from JSONL or CSV files.
+     """
+
+     def load(self):
+
+         path = self.data_id_or_path
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         dataset = []
+
+         # Check for JSONL or CSV files in the specified path
+         for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+             # Check if the file exists with the given extension
+             if os.path.isfile(path) and path.endswith(ext):
+                 file_paths = [path]
+             else:
+                 file_paths = [
+                     os.path.join(path, f'{self.subset}_{self.split}{ext}'),
+                     os.path.join(path, f'{self.subset}{ext}')
+                 ]
+             # If a candidate file exists, load it
+             for file_path in file_paths:
+                 if os.path.exists(file_path):
+                     dataset = loader(file_path)
+                     break  # Stop checking the remaining candidate paths for this extension
+
+         # shuffle if requested
+         if self.shuffle:
+             random.Random(self.seed).shuffle(dataset)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             dataset = dataset[:self.limit]
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(
+             samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
+             name=Path(path).stem if Path(path).exists() else path,
+             location=path,
+         )
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
+
+
+ class DictDataLoader(DataLoader):
+     """Load dataset from a list of dictionaries."""
+
+     def __init__(self, dict_list: list, **kwargs):
+         super().__init__(data_id_or_path='', split='', **kwargs)
+         self.dict_list = dict_list
+
+     def load(self) -> Dataset:
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         dataset = self.dict_list
+
+         # shuffle if requested
+         if self.shuffle:
+             random.Random(self.seed).shuffle(dataset)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             dataset = dataset[:self.limit]
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(samples=data_to_samples(data=dataset, data_to_sample=data_to_sample))
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
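A matching sketch for the loaders above, using DictDataLoader since it needs neither files on disk nor hub access. The records and field values are invented, and the snippet assumes that record_to_sample_fn (defined in evalscope/api/dataset/utils.py, which is not shown in this diff) maps plain dicts onto Sample fields according to the given FieldSpec.

from evalscope.api.dataset.dataset import FieldSpec
from evalscope.api.dataset.loader import DictDataLoader

# Invented records keyed by the default FieldSpec field names ('input' / 'target').
records = [
    {'input': 'What is 2 + 2?', 'target': '4'},
    {'input': 'Name a prime number greater than 5.', 'target': '7'},
]

loader = DictDataLoader(
    dict_list=records,
    sample_fields=FieldSpec(),  # map the 'input' / 'target' keys onto Sample fields
    limit=2,                    # keep at most two records
    repeats=1,                  # no repeated sampling, so group_id == id after reindexing
    shuffle=False,
)
dataset = loader.load()
for sample in dataset:
    print(sample.pretty_print())

RemoteDataLoader and LocalDataLoader follow the same load() flow (shuffle, limit, repeat, convert to MemoryDataset, filter, reindex), differing only in where the raw records come from.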