evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/dataset/loader.py (new file)
@@ -0,0 +1,262 @@
+ import copy
+ import os
+ import random
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Callable, Dict, List, Optional, Union
+
+ from evalscope.api.dataset.utils import record_to_sample_fn
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
+ from evalscope.utils import get_logger
+ from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename
+ from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
+ from .utils import data_to_samples, shuffle_choices_if_requested
+
+ logger = get_logger()
+
+
+ class DataLoader(ABC):
+     """
+     Abstract base class for data loaders.
+     """
+
+     def __init__(
+         self,
+         data_id_or_path: str,
+         split: str,
+         sample_fields: Union[FieldSpec, Callable] = None,
+         filter_func: Callable = None,
+         subset: str = 'default',
+         version: str = None,
+         limit: Union[int, float] = None,
+         data_source: Optional[str] = None,
+         shuffle: bool = False,
+         shuffle_choices: Optional[Union[bool, int]] = None,
+         seed: Optional[int] = None,
+         auto_id: bool = True,
+         repeats: int = 1,
+         trust_remote: bool = True,
+         **kwargs
+     ):
+         self.data_id_or_path = data_id_or_path
+         self.split = split
+         self.sample_fields = sample_fields
+         self.filter_func = filter_func
+         self.subset = subset
+         self.version = version
+         self.limit = limit
+         self.data_source = data_source
+         self.shuffle = shuffle
+         self.shuffle_choices = shuffle_choices
+         self.seed = seed
+         self.auto_id = auto_id
+         self.repeats = repeats
+         self.trust_remote = trust_remote
+         self.kwargs = kwargs
+
+     @abstractmethod
+     def load(self) -> Dataset:
+         """
+         Load data from the source.
+         """
+         ...
+
+
+ class RemoteDataLoader(DataLoader):
+     """
+     Data loader for remote datasets: ModelScope or Huggingface.
+     """
+
+     def load(self) -> Dataset:
+         import datasets
+         from modelscope import MsDataset
+
+         path = self.data_id_or_path
+         # resolve data_to_sample function
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         # generate a unique cache dir for this dataset
+         dataset_hash = gen_hash(f'{path}{self.split}{self.subset}{self.version}{self.kwargs}')
+         datasets_cache_dir = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'datasets')
+         dataset_cache_dir = os.path.join(datasets_cache_dir, f'{safe_filename(path)}-{dataset_hash}')
+         if os.path.exists(dataset_cache_dir):
+             dataset = datasets.load_from_disk(dataset_cache_dir)
+         else:
+             logger.info(
+                 f'Loading dataset {path} from {self.data_source} > subset: {self.subset} > split: {self.split} ...'
+             )
+             if self.data_source == HubType.MODELSCOPE:
+                 dataset = MsDataset.load(
+                     dataset_name=path,
+                     split=self.split,
+                     subset_name=self.subset,
+                     version=self.version,
+                     trust_remote_code=self.trust_remote,
+                     **self.kwargs,
+                 )
+                 # convert to Huggingface dataset if necessary
+                 if not isinstance(dataset, datasets.Dataset):
+                     dataset = dataset.to_hf_dataset()
+             elif self.data_source in [HubType.HUGGINGFACE, HubType.LOCAL]:
+                 # remove dataset_infos.json if it exists, since datasets will raise an error when it is present.
+                 dataset_infos_path = os.path.join(path, 'dataset_infos.json')
+                 if os.path.exists(dataset_infos_path):
+                     logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid datasets errors.')
+                     os.remove(dataset_infos_path)
+                 # load dataset from Huggingface or local path
+                 dataset = datasets.load_dataset(
+                     path=path,
+                     name=self.subset if self.subset != 'default' else None,
+                     split=self.split,
+                     revision=self.version,
+                     trust_remote_code=self.trust_remote,
+                     **self.kwargs,
+                 )
+
+             # Only save to disk if not loading from local path
+             if self.data_source != HubType.LOCAL:
+                 dataset.save_to_disk(dataset_cache_dir)
+
+         # shuffle if requested
+         if self.shuffle:
+             dataset = dataset.shuffle(seed=self.seed)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             if len(dataset) > self.limit:
+                 dataset = dataset.select(range(self.limit))
+
+         # convert to list
+         dataset = dataset.to_list()
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(
+             samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
+             name=Path(path).stem if Path(path).exists() else path,
+             location=path,
+         )
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
+
+
+ class LocalDataLoader(DataLoader):
+     """
+     Data loader for local datasets. Reads from JSONL or CSV files.
+     """
+
+     def load(self):
+
+         path = self.data_id_or_path
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         dataset = []
+
+         # Check for JSONL or CSV files in the specified path
+         for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+             # Check if the file exists with the given extension
+             if os.path.isfile(path) and path.endswith(ext):
+                 file_paths = [path]
+             else:
+                 file_paths = [
+                     os.path.join(path, f'{self.subset}_{self.split}{ext}'),
+                     os.path.join(path, f'{self.subset}{ext}')
+                 ]
+             # If the file exists, load it
+             for file_path in file_paths:
+                 if os.path.exists(file_path):
+                     dataset = loader(file_path)
+                     break  # Stop checking other extensions once a file is found
+
+         # shuffle if requested
+         if self.shuffle:
+             random.shuffle(dataset, self.seed)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             dataset = dataset[:self.limit]
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(
+             samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
+             name=Path(path).stem if Path(path).exists() else path,
+             location=path,
+         )
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
+
+
+ class DictDataLoader(DataLoader):
+     """Load dataset from a list of dictionaries."""
+
+     def __init__(self, dict_list: list, **kwargs):
+         super().__init__(data_id_or_path='', split='', **kwargs)
+         self.dict_list = dict_list
+
+     def load(self) -> Dataset:
+         data_to_sample = record_to_sample_fn(self.sample_fields)
+         dataset = self.dict_list
+
+         # shuffle if requested
+         if self.shuffle:
+             random.shuffle(dataset, self.seed)
+
+         # limit if requested
+         if self.limit:
+             if isinstance(self.limit, float):
+                 self.limit = int(len(dataset) * self.limit)
+             elif isinstance(self.limit, int) and self.limit < 0:
+                 raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
+             dataset = dataset[:self.limit]
+
+         # repeat k times
+         if self.repeats > 1:
+             dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]
+
+         # return the dataset
+         memory_dataset = MemoryDataset(samples=data_to_samples(data=dataset, data_to_sample=data_to_sample), )
+
+         # Apply filtering if a filter function is provided
+         if self.filter_func is not None:
+             memory_dataset = memory_dataset.filter(self.filter_func)
+
+         # assign ids and group_ids if requested
+         if self.auto_id:
+             memory_dataset.reindex(group_size=self.repeats)
+
+         shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)
+
+         return memory_dataset
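A minimal usage sketch (not part of the diff) of how these new loaders might be driven directly. The import paths follow the file list above; the FieldSpec keyword names mirror the attributes read back in utils.py (input/target/choices), and the data path and column names are hypothetical placeholders.

# Hypothetical sketch; assumes FieldSpec accepts these keywords and that the
# modules are importable from the paths shown in the file list.
from evalscope.api.dataset.dataset import FieldSpec
from evalscope.api.dataset.loader import DictDataLoader, LocalDataLoader

# Map raw record columns onto Sample fields.
fields = FieldSpec(input='question', target='answer', choices='options')

# Reads data/my_benchmark/default_test.jsonl (or default.jsonl / .csv variants).
local_ds = LocalDataLoader(
    data_id_or_path='data/my_benchmark',   # placeholder path
    split='test',
    sample_fields=fields,
    limit=100,    # keep at most 100 records
    repeats=2,    # duplicate each record, e.g. for repeated sampling
).load()

# Or wrap an in-memory list of dicts.
dict_ds = DictDataLoader(
    dict_list=[{'question': '1 + 1 = ?', 'answer': '2', 'options': ['1', '2', '3']}],
    sample_fields=fields,
).load()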
evalscope/api/dataset/utils.py (new file)
@@ -0,0 +1,143 @@
+ import json
+ from tqdm import tqdm
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast
+
+ from .dataset import Dataset, FieldSpec, Sample
+
+
+ def record_to_sample_fn(sample_fields: Union[FieldSpec, Callable, None] = None, ) -> Callable:
+     if sample_fields is None:
+         sample_fields = FieldSpec()
+
+     if isinstance(sample_fields, FieldSpec):
+
+         def record_to_sample(record: dict) -> Sample:
+             # collect metadata if specified
+             metadata: Optional[Dict[str, Any]] = None
+             if sample_fields.metadata:
+                 if isinstance(sample_fields.metadata, list):
+                     metadata = {}
+                     for name in sample_fields.metadata:
+                         metadata[name] = record.get(name)
+
+             elif 'metadata' in record:
+                 metadata_field = record.get('metadata')
+                 if isinstance(metadata_field, str):
+                     metadata = json.loads(metadata_field)
+                 elif isinstance(metadata_field, dict):
+                     metadata = metadata_field
+                 else:
+                     raise ValueError(f"Unexpected type for 'metadata' field: {type(metadata_field)}")
+
+             # return sample
+             return Sample(
+                 input=read_input(record.get(sample_fields.input)),
+                 target=read_target(record.get(sample_fields.target)),
+                 choices=read_choices(record.get(sample_fields.choices)),
+                 id=record.get(sample_fields.id, None),
+                 metadata=metadata,
+                 sandbox=read_sandbox(record.get(sample_fields.sandbox)),
+                 files=read_files(record.get(sample_fields.files)),
+                 setup=read_setup(record.get(sample_fields.setup)),
+             )
+
+         return record_to_sample
+
+     else:
+         return sample_fields
+
+
+ def data_to_samples(data: Iterable[dict], data_to_sample: Callable) -> List[Sample]:
+     samples: List[Sample] = []
+     for record in tqdm(data, desc='Processing records'):
+         record_samples = as_sample_list(data_to_sample(record=record))
+         samples.extend(record_samples)
+     return samples
+
+
+ def as_sample_list(samples: Union[Sample, List[Sample]]) -> List[Sample]:
+     if isinstance(samples, list):
+         return samples
+     else:
+         return [samples]
+
+
+ def read_input(input_val: Optional[Any]) -> str:
+     if not input_val:
+         raise ValueError('No input in dataset')
+     return str(input_val)
+
+
+ def read_target(obj: Optional[Any]) -> Union[str, List[str]]:
+     if obj is not None:
+         return [str(item) for item in obj] if isinstance(obj, list) else str(obj)
+     else:
+         return ''
+
+
+ def read_choices(obj: Optional[Any]) -> Optional[List[str]]:
+     if obj is not None:
+         if isinstance(obj, list):
+             return [str(choice) for choice in obj]
+         elif isinstance(obj, str):
+             choices = obj.split(',')
+             if len(choices) == 1:
+                 choices = obj.split()
+             return [choice.strip() for choice in choices]
+         else:
+             return [str(obj)]
+     else:
+         return None
+
+
+ def read_setup(setup: Optional[Any]) -> Optional[str]:
+     if setup is not None:
+         return str(setup)
+     else:
+         return None
+
+
+ def read_sandbox(sandbox: Optional[Any]) -> Optional[str]:
+     if sandbox is not None:
+         if isinstance(sandbox, str):
+             return sandbox
+         elif isinstance(sandbox, dict):
+             return json.dumps(sandbox)
+         else:
+             raise ValueError(f"Unexpected type for 'sandbox' field: {type(sandbox)}")
+     else:
+         return None
+
+
+ def read_files(files: Optional[Any]) -> Optional[Dict[str, str]]:
+     if files is not None:
+         if isinstance(files, str):
+             files = json.loads(files)
+         if isinstance(files, dict):
+             if all(isinstance(v, str) for v in files.values()):
+                 return cast(Dict[str, str], files)
+
+         # didn't find the right type
+         raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
+     else:
+         return None
+
+
+ def shuffle_choices_if_requested(dataset: Dataset, shuffle_choices: Optional[Union[bool, int]]) -> None:
+     """
+     Shuffle the choices in the dataset if requested.
+
+     The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
+     and `hf_dataset` can be a boolean, an integer, or `None` (default).
+     If it is a boolean, it will shuffle the choices if the value is `True`,
+     and do nothing if it is `False`.
+     If it is an integer, it will shuffle the choices using the integer as the seed.
+     """
+     # Note that `isinstance(x, int)` returns True if x is True or False,
+     # so we need to check for both explicitly
+     if shuffle_choices is True:
+         dataset.shuffle_choices()
+     elif shuffle_choices is False:
+         pass
+     elif isinstance(shuffle_choices, int):
+         dataset.shuffle_choices(seed=shuffle_choices)
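A short hypothetical sketch (not part of the diff) of the callable branch of record_to_sample_fn: a custom converter is passed through unchanged and then applied by data_to_samples. The record keys ('prompt', 'label', 'options', 'source') and the import paths are assumptions for illustration only.

# Hypothetical sketch; the Sample keyword names mirror the FieldSpec branch above.
from evalscope.api.dataset.dataset import Sample
from evalscope.api.dataset.utils import data_to_samples, record_to_sample_fn

def my_record_to_sample(record: dict) -> Sample:
    # Build the Sample explicitly instead of relying on a FieldSpec mapping.
    return Sample(
        input=record['prompt'],
        target=str(record['label']),
        choices=[str(c) for c in record.get('options', [])] or None,
        metadata={'source': record.get('source')},
    )

records = [{'prompt': 'Capital of France?', 'label': 'Paris', 'options': ['Paris', 'Rome']}]
# A callable is returned as-is by record_to_sample_fn; FieldSpec inputs get wrapped instead.
samples = data_to_samples(records, record_to_sample_fn(my_record_to_sample))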
evalscope/api/evaluator/__init__.py (new file)
@@ -0,0 +1,3 @@
+ from .cache import CacheManager, ModelResult, ReviewResult
+ from .evaluator import Evaluator
+ from .state import Choices, Target, TaskState