evalscope 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (148)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +7 -4
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  6. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  7. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  8. evalscope/api/benchmark/benchmark.py +62 -2
  9. evalscope/api/benchmark/meta.py +9 -0
  10. evalscope/api/dataset/dataset.py +6 -6
  11. evalscope/api/dataset/loader.py +2 -1
  12. evalscope/api/evaluator/cache.py +24 -1
  13. evalscope/api/evaluator/evaluator.py +5 -0
  14. evalscope/api/evaluator/state.py +17 -1
  15. evalscope/api/messages/__init__.py +1 -0
  16. evalscope/api/messages/chat_message.py +52 -2
  17. evalscope/api/metric/scorer.py +15 -7
  18. evalscope/api/mixin/__init__.py +1 -1
  19. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  20. evalscope/api/mixin/sandbox_mixin.py +204 -0
  21. evalscope/api/model/generate_config.py +1 -6
  22. evalscope/api/model/model.py +5 -2
  23. evalscope/api/tool/tool_info.py +1 -1
  24. evalscope/app/app.py +3 -0
  25. evalscope/app/ui/single_model.py +3 -3
  26. evalscope/app/utils/data_utils.py +7 -7
  27. evalscope/app/utils/env_utils.py +12 -0
  28. evalscope/app/utils/text_utils.py +14 -12
  29. evalscope/arguments.py +8 -4
  30. evalscope/backend/opencompass/backend_manager.py +0 -2
  31. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  32. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  33. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  34. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  35. evalscope/benchmarks/bfcl/bfcl_adapter.py +142 -7
  36. evalscope/benchmarks/bfcl/generation.py +9 -9
  37. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  38. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  39. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  40. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  41. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  42. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  43. evalscope/benchmarks/healthbench/utils.py +102 -0
  44. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  45. evalscope/benchmarks/humaneval/utils.py +235 -0
  46. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  47. evalscope/benchmarks/image_edit/__init__.py +0 -0
  48. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  49. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  50. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  51. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  52. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  53. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  54. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  55. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  56. evalscope/benchmarks/math_vista/__init__.py +0 -0
  57. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  58. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  59. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  60. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  61. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  62. evalscope/benchmarks/mm_star/__init__.py +0 -0
  63. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  64. evalscope/benchmarks/mmmu/__init__.py +0 -0
  65. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  66. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  67. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  68. evalscope/benchmarks/multi_if/__init__.py +0 -0
  69. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  70. evalscope/benchmarks/multi_if/metrics.py +120 -0
  71. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  72. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +6 -5
  73. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  74. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  75. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  76. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  77. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  78. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  79. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  80. evalscope/benchmarks/tau_bench/generation.py +1 -1
  81. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +20 -19
  82. evalscope/benchmarks/text2image/__init__.py +0 -0
  83. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  84. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  85. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  86. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  87. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  88. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  89. evalscope/cli/start_app.py +7 -1
  90. evalscope/cli/start_perf.py +7 -1
  91. evalscope/config.py +96 -14
  92. evalscope/constants.py +11 -0
  93. evalscope/evaluator/evaluator.py +30 -10
  94. evalscope/metrics/llm_judge.py +19 -7
  95. evalscope/metrics/metric.py +27 -2
  96. evalscope/models/image_edit_model.py +125 -0
  97. evalscope/models/model_apis.py +22 -0
  98. evalscope/models/openai_compatible.py +3 -0
  99. evalscope/models/text2image_model.py +2 -2
  100. evalscope/models/utils/openai.py +8 -6
  101. evalscope/perf/arguments.py +2 -0
  102. evalscope/perf/benchmark.py +2 -0
  103. evalscope/perf/plugin/api/base.py +2 -2
  104. evalscope/perf/plugin/api/default_api.py +7 -7
  105. evalscope/perf/plugin/api/openai_api.py +83 -19
  106. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  107. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  108. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  109. evalscope/perf/utils/benchmark_util.py +7 -5
  110. evalscope/perf/utils/local_server.py +3 -0
  111. evalscope/report/__init__.py +0 -1
  112. evalscope/report/combinator.py +0 -25
  113. evalscope/report/generator.py +8 -87
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +9 -5
  116. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  117. evalscope/utils/chat_service.py +1 -1
  118. evalscope/utils/function_utils.py +41 -0
  119. evalscope/utils/import_utils.py +73 -1
  120. evalscope/utils/io_utils.py +56 -7
  121. evalscope/utils/json_schema.py +23 -2
  122. evalscope/utils/logger.py +19 -0
  123. evalscope/utils/model_utils.py +4 -3
  124. evalscope/utils/multi_choices.py +23 -6
  125. evalscope/version.py +2 -2
  126. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/METADATA +17 -24
  127. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/RECORD +145 -103
  128. tests/benchmark/test_eval.py +80 -37
  129. tests/benchmark/test_image_edit.py +65 -0
  130. tests/benchmark/test_sandbox.py +81 -0
  131. tests/benchmark/test_vlm.py +137 -0
  132. tests/cli/test_all.py +83 -43
  133. tests/cli/test_collection.py +8 -5
  134. tests/cli/test_reasoning.py +81 -0
  135. tests/common.py +73 -0
  136. tests/perf/test_perf.py +44 -14
  137. tests/rag/test_clip_benchmark.py +0 -3
  138. evalscope/api/mixin/dataset_mixin.py +0 -105
  139. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  140. tests/aigc/__init__.py +0 -1
  141. /evalscope/benchmarks/{aigc → ai2d}/__init__.py +0 -0
  142. /evalscope/benchmarks/{aigc/i2i → amc}/__init__.py +0 -0
  143. /evalscope/benchmarks/{aigc/t2i → healthbench}/__init__.py +0 -0
  144. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  145. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  146. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  147. {evalscope-1.0.0.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
  148. /tests/{aigc → benchmark}/test_t2i.py +0 -0
tests/cli/test_reasoning.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+ from unittest import TestCase
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.run import run_task
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class TestReasoning(TestCase):
+     """Benchmark evaluation test cases."""
+
+     def setUp(self):
+         """Setup common test configuration."""
+         self.base_config = {
+             'model': 'Qwen3-0.6B',
+             'api_url': 'http://0.0.0.0:8801/v1',
+             'api_key': env.get('DASHSCOPE_API_KEY'),
+             'eval_type': EvalType.SERVICE,
+             'eval_batch_size': 5,
+             'limit': 5,
+             'generation_config': {
+                 'max_tokens': 4096,
+                 'temperature': 0.0,
+                 'seed': 42,
+                 'parallel_tool_calls': True,
+                 'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+             },
+             'judge_strategy': JudgeStrategy.AUTO,
+             'judge_worker_num': 5,
+             'judge_model_args': {
+                 'model_id': 'qwen2.5-72b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                 'generation_config': {
+                     'temperature': 0.0,
+                     'max_tokens': 4096,
+                 }
+             },
+             'debug': True,
+         }
+
+     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+         """Helper method to run test for a specific dataset."""
+         config = self.base_config.copy()
+         config['datasets'] = [dataset_name]
+
+         if use_mock:
+             config['eval_type'] = EvalType.MOCK_LLM
+
+         # apply config overrides
+         config.update(config_overrides)
+
+         if dataset_args:
+             config['dataset_args'] = {dataset_name: dataset_args}
+
+         task_cfg = TaskConfig(**config)
+         run_task(task_cfg=task_cfg)
+
+     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+         """Helper method to test dataset loading."""
+
+         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+     # Math & Reasoning datasets
+     def test_gsm8k(self):
+         """Test GSM8K math reasoning dataset."""
+         self._run_dataset_test('gsm8k')
+
+
+ if __name__ == '__main__':
+     # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+     # Run all tests: python -m unittest test_eval.TestBenchmark
+     unittest.main()
tests/common.py ADDED
@@ -0,0 +1,73 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+ from unittest import TestCase
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy
+ from evalscope.run import run_task
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class TestBenchmark(TestCase):
+     """Benchmark evaluation test cases."""
+
+     def setUp(self):
+         """Setup common test configuration."""
+         self.base_config = {
+             'model': 'qwen-plus',
+             'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+             'api_key': env.get('DASHSCOPE_API_KEY'),
+             'eval_type': EvalType.SERVICE,
+             'eval_batch_size': 5,
+             'limit': 5,
+             'generation_config': {
+                 'max_tokens': 4096,
+                 'temperature': 0.0,
+                 'seed': 42,
+                 'parallel_tool_calls': True
+             },
+             'judge_strategy': JudgeStrategy.AUTO,
+             'judge_worker_num': 5,
+             'judge_model_args': {
+                 'model_id': 'qwen2.5-72b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+                 'generation_config': {
+                     'temperature': 0.0,
+                     'max_tokens': 4096,
+                 }
+             },
+             'debug': True,
+         }
+
+     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+         """Helper method to run test for a specific dataset."""
+         config = self.base_config.copy()
+         config['datasets'] = [dataset_name]
+
+         if not env.get('DASHSCOPE_API_KEY'):
+             use_mock = True
+             logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+         if use_mock:
+             config['eval_type'] = EvalType.MOCK_LLM
+
+         # apply config overrides
+         config.update(config_overrides)
+
+         if dataset_args:
+             config['dataset_args'] = {dataset_name: dataset_args}
+
+         task_cfg = TaskConfig(**config)
+         run_task(task_cfg=task_cfg)
+
+     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+         """Helper method to test dataset loading."""
+
+         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
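
For orientation, here is a minimal, hypothetical sketch of how a test module might build on the shared TestBenchmark helpers introduced in tests/common.py above; the subclass name, dataset name, and the limit override are illustrative assumptions, not taken from this diff.

# Hypothetical usage of the TestBenchmark base added in tests/common.py above.
# The import path, class name, dataset name and 'limit' override are assumptions.
from tests.common import TestBenchmark


class TestMathBenchmark(TestBenchmark):

    def test_gsm8k(self):
        # Runs the benchmark via run_task through the shared helper; extra
        # keyword arguments are applied as config overrides (here, limit=2).
        # Falls back to mock evaluation when DASHSCOPE_API_KEY is not set.
        self._run_dataset_test('gsm8k', limit=2)

    def test_gsm8k_load_only(self):
        # Only exercises dataset loading, using the mock model and no limit.
        self._run_dataset_load_test('gsm8k')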
tests/perf/test_perf.py CHANGED
@@ -1,9 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  from dotenv import dotenv_values
 
  env = dotenv_values('.env')
- os.environ['CUDA_VISIBLE_DEVICES'] = '0'
  import unittest
 
  from evalscope.perf.main import run_perf_benchmark
@@ -18,7 +16,7 @@ class TestPerf(unittest.TestCase):
      def tearDown(self) -> None:
          pass
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf(self):
          task_cfg = {
              'url': 'http://127.0.0.1:8001/v1/chat/completions',
@@ -32,7 +30,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_stream(self):
          task_cfg = {
              'url': 'http://127.0.0.1:8801/v1/chat/completions',
@@ -46,7 +44,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_speed_benchmark(self):
          task_cfg = {
              'url': 'http://127.0.0.1:8001/v1/completions',
@@ -60,7 +58,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_local(self):
          task_cfg = {
              'parallel': 1,
@@ -72,7 +70,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_local_stream(self):
          task_cfg = {
              'parallel': 1,
@@ -85,7 +83,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_local_speed_benchmark(self):
          task_cfg = {
              'parallel': 1,
@@ -98,7 +96,7 @@
          }
          run_perf_benchmark(task_cfg)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_local_random(self):
          from evalscope.perf.arguments import Arguments
          task_cfg = Arguments(
@@ -121,13 +119,45 @@
          print(metrics_result)
          print(percentile_result)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_completion_endpoint(self):
+         if not env.get('DASHSCOPE_API_KEY'):
+             self.skipTest('DASHSCOPE_API_KEY is not set.')
+             return
+
+         from evalscope.perf.arguments import Arguments
+         task_cfg = Arguments(
+             parallel=[1, 2],
+             number=[2, 4],
+             model='qwen2.5-coder-7b-instruct',
+             url='https://dashscope.aliyuncs.com/compatible-mode/v1/completions',
+             api_key=env.get('DASHSCOPE_API_KEY'),
+             api='openai',
+             dataset='random',
+             min_tokens=100,
+             max_tokens=100,
+             prefix_length=0,
+             min_prompt_length=1024,
+             max_prompt_length=1024,
+             stream=False,
+             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+             seed=None,
+             extra_args={'ignore_eos': True}
+         )
+         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+         print(metrics_result)
+         print(percentile_result)
+
+
      def test_run_perf_multi_parallel(self):
+         if not env.get('DASHSCOPE_API_KEY'):
+             self.skipTest('DASHSCOPE_API_KEY is not set.')
+             return
+
          from evalscope.perf.arguments import Arguments
          task_cfg = Arguments(
              parallel=[1, 2],
              number=[2, 4],
-             model='qwen2.5-7b-instruct',
+             model='qwen-plus',
              url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
              api_key=env.get('DASHSCOPE_API_KEY'),
              api='openai',
@@ -145,7 +175,7 @@
          print(metrics_result)
          print(percentile_result)
 
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+
      def test_run_perf_random_vl(self):
          from evalscope.perf.arguments import Arguments
          task_cfg = Arguments(
@@ -155,7 +185,7 @@
              url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
              api_key=env.get('DASHSCOPE_API_KEY'),
              api='openai',
-             dataset='kontext_bench',
+             dataset='random_vl',
              min_tokens=100,
              max_tokens=100,
              prefix_length=0,
@@ -164,7 +194,7 @@
              image_height=512,
              image_width=512,
              image_num=2,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
+             tokenizer_path='Qwen/Qwen2.5-VL-7B-Instruct',
              seed=None,
              extra_args={'ignore_eos': True}
          )
tests/rag/test_clip_benchmark.py CHANGED
@@ -1,8 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
  import os
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
  import subprocess
  import unittest
 
evalscope/api/mixin/dataset_mixin.py DELETED
@@ -1,105 +0,0 @@
- from abc import ABC
- from collections import defaultdict
- from typing import Any, Callable, Dict
-
- from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
-
-
- class DatasetLoaderMixin:
-     """
-     Mixin class providing dataset loading functionality for benchmarks.
-
-     This mixin provides common dataset loading methods that can be shared
-     across different data adapters, including support for:
-     - Loading multiple subsets
-     - Few-shot dataset loading
-     - Remote dataset loading with configuration
-     """
-
-     def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
-         """
-         Load multiple subsets of the dataset using the provided loading function.
-
-         This method handles two loading strategies:
-         1. Reformat mode: Load only the default subset and reformat it
-         2. Multi-subset mode: Load all subsets specified in subset_list
-
-         Args:
-             load_func (Callable[[str], Dataset]): Function to load individual subsets
-
-         Returns:
-             DatasetDict: Dictionary containing all loaded subsets
-         """
-         if self.reformat_subset:
-             # Load only the default subset
-             subset_data = load_func(self.default_subset)
-             # Reformat the subset to create multiple subsets based on sample keys
-             # NOTE: subset_list and limit is applied here if specified
-             dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
-         else:
-             # Load all specified subsets into separate entries
-             subset_dict = defaultdict()
-             for subset in self.subset_list:
-                 subset_data = load_func(subset)
-                 subset_dict[subset] = subset_data
-             dataset_dict = DatasetDict(subset_dict)
-         return dataset_dict
-
-     def load_subset(self, subset: str) -> Dataset:
-         """
-         Load a specific subset of the dataset for evaluation.
-
-         This method configures and executes the data loading for a single subset,
-         handling both split-as-subset and traditional subset configurations.
-
-         Args:
-             subset (str): The subset identifier to load
-
-         Returns:
-             Dataset: The loaded dataset subset with processed samples
-         """
-         # Determine the split and subset names based on configuration
-         split = subset if self.split_as_subset else self.eval_split
-         subset_name = self.default_subset if self.split_as_subset else subset
-
-         # Create and configure the remote data loader
-         loader = RemoteDataLoader(
-             data_id_or_path=self.dataset_id,
-             split=split,
-             subset=subset_name,
-             sample_fields=self.record_to_sample,  # Custom sample conversion function
-             limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
-             repeats=self._task_config.repeats,  # Number of repetitions for each sample
-             data_source=self._task_config.dataset_hub,  # Data source configuration
-         )
-         return loader.load()
-
-     def load_fewshot_subset(self, subset: str) -> Dataset:
-         """
-         Load a subset specifically for few-shot examples.
-
-         This method loads training data to be used as demonstrations in few-shot prompting.
-         It typically loads from the training split with limited samples and optional shuffling.
-
-         Args:
-             subset (str): The subset identifier to load few-shot examples from
-
-         Returns:
-             Dataset: The loaded few-shot dataset with demonstration examples
-         """
-         # Use training split for few-shot examples
-         split = subset if self.split_as_subset else self.train_split
-         subset_name = self.default_subset if self.split_as_subset else subset
-
-         # Create loader specifically configured for few-shot sampling
-         loader = RemoteDataLoader(
-             data_id_or_path=self.dataset_id,
-             split=split,
-             subset=subset_name,
-             sample_fields=self.record_to_sample,
-             limit=self.few_shot_num
-             if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
-             shuffle=self.few_shot_random,  # Randomize selection if enabled
-             data_source=self._task_config.dataset_hub,
-         )
-         return loader.load()
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py DELETED
@@ -1,44 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class GeneralI2IAdapter:
-
-     def __init__(self, **kwargs):
-
-         super().__init__(**kwargs)
-
-     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-         dataset_name_or_path = dataset_name_or_path or self.dataset_id
-         subset_list = subset_list or self.subset_list
-
-         data_file_dict = defaultdict(str)
-         data_item_dict = defaultdict(list)
-
-         # get data file path and subset name
-         if os.path.isdir(dataset_name_or_path):
-             for subset_name in subset_list:
-                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-         elif os.path.isfile(dataset_name_or_path):
-             cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-             data_file_dict[cur_subset_name] = dataset_name_or_path
-         else:
-             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-         # load data from local disk
-         try:
-             for subset_name, file_path in data_file_dict.items():
-                 data_item_dict[subset_name] = jsonl_to_list(file_path)
-         except Exception as e:
-             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-         data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-         return data_dict
tests/aigc/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.