evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (97)
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
  6. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  7. evalscope/api/benchmark/benchmark.py +35 -0
  8. evalscope/api/benchmark/meta.py +6 -0
  9. evalscope/api/dataset/dataset.py +6 -6
  10. evalscope/api/dataset/loader.py +2 -1
  11. evalscope/api/evaluator/cache.py +24 -1
  12. evalscope/api/evaluator/state.py +12 -1
  13. evalscope/api/messages/__init__.py +1 -0
  14. evalscope/api/messages/chat_message.py +47 -2
  15. evalscope/api/metric/scorer.py +15 -7
  16. evalscope/api/mixin/__init__.py +0 -1
  17. evalscope/api/model/generate_config.py +1 -3
  18. evalscope/api/model/model.py +4 -1
  19. evalscope/app/app.py +3 -0
  20. evalscope/app/ui/single_model.py +3 -3
  21. evalscope/app/utils/data_utils.py +7 -7
  22. evalscope/app/utils/env_utils.py +12 -0
  23. evalscope/app/utils/text_utils.py +14 -12
  24. evalscope/arguments.py +2 -4
  25. evalscope/backend/opencompass/backend_manager.py +0 -2
  26. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  27. evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
  28. evalscope/benchmarks/bfcl/generation.py +2 -2
  29. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  31. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  32. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  33. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  34. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  35. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  36. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  37. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  38. evalscope/benchmarks/mmmu/__init__.py +0 -0
  39. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  40. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  41. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  42. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
  43. evalscope/benchmarks/tau_bench/generation.py +1 -1
  44. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
  45. evalscope/benchmarks/text2image/__init__.py +0 -0
  46. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  47. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  48. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  49. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  50. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  51. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  52. evalscope/cli/start_app.py +7 -1
  53. evalscope/cli/start_perf.py +7 -1
  54. evalscope/config.py +72 -13
  55. evalscope/constants.py +8 -0
  56. evalscope/evaluator/evaluator.py +6 -4
  57. evalscope/metrics/llm_judge.py +19 -7
  58. evalscope/models/image_edit_model.py +125 -0
  59. evalscope/models/model_apis.py +20 -0
  60. evalscope/models/openai_compatible.py +3 -0
  61. evalscope/models/text2image_model.py +2 -2
  62. evalscope/models/utils/openai.py +7 -4
  63. evalscope/perf/benchmark.py +2 -0
  64. evalscope/perf/utils/benchmark_util.py +8 -5
  65. evalscope/perf/utils/local_server.py +3 -0
  66. evalscope/report/__init__.py +0 -1
  67. evalscope/report/generator.py +8 -87
  68. evalscope/run.py +9 -5
  69. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  70. evalscope/utils/chat_service.py +1 -1
  71. evalscope/utils/import_utils.py +23 -1
  72. evalscope/utils/io_utils.py +42 -1
  73. evalscope/utils/model_utils.py +4 -3
  74. evalscope/utils/multi_choices.py +23 -6
  75. evalscope/version.py +2 -2
  76. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
  77. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
  78. tests/benchmark/test_eval.py +30 -31
  79. tests/benchmark/test_image_edit.py +65 -0
  80. tests/benchmark/test_vlm.py +80 -0
  81. tests/cli/test_all.py +83 -43
  82. tests/cli/test_collection.py +8 -5
  83. tests/cli/test_reasoning.py +81 -0
  84. tests/common.py +73 -0
  85. tests/perf/test_perf.py +4 -2
  86. tests/rag/test_clip_benchmark.py +0 -3
  87. evalscope/api/mixin/dataset_mixin.py +0 -105
  88. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  89. tests/aigc/__init__.py +0 -1
  90. /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
  91. /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
  92. /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
  93. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  94. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  95. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  96. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  97. /tests/{aigc → benchmark}/test_t2i.py +0 -0
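
The version bump itself lands in evalscope/version.py (entry 75 above). A minimal, hedged sketch for confirming the installed version after upgrading — the module's existence is shown by this diff, but the exact attribute name it exposes is an assumption:

# Hypothetical post-upgrade check; evalscope/version.py exists per the file list,
# but the attribute name is an assumption.
#   pip install --upgrade "evalscope==1.0.1"
from evalscope import version

print(getattr(version, '__version__', 'unknown'))  # expected: 1.0.1 after upgrading
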
tests/cli/test_all.py CHANGED
@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()
 
 datasets=[
-    'iquiz',
-    'ifeval',
-    'mmlu',
-    'mmlu_pro',
-    'musr',
-    'process_bench',
-    'race',
-    'trivia_qa',
-    'cmmlu',
-    'humaneval',
-    'gsm8k',
-    'bbh',
-    'competition_math',
-    'math_500',
-    'aime24',
-    'gpqa_diamond',
-    'arc',
-    'ceval',
-    'hellaswag',
-    'general_mcq',
-    'general_qa',
-    'super_gpqa',
-    # 'live_code_bench',
-    'mmlu_redux',
-    'simple_qa',
-    'chinese_simpleqa',
-    'alpaca_eval',
-    'arena_hard',
-    'maritime_bench',
-    'drop',
-    'winogrande',
-    'tool_bench',
-    'frames',
-    'docmath',
-    'needle_haystack',
-    'bfcl_v3',
-    'hle',
-    'tau_bench',
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa_diamond',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
+    'super_gpqa',
+    # 'live_code_bench',
+    'mmlu_redux',
+    'simple_qa',
+    'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
+    'maritime_bench',
+    'drop',
+    'winogrande',
+    'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
+    'hle',
+    'tau_bench',
 ]
 
 # Reverse the datasets list to ensure the order is from most recent to oldest
@@ -150,7 +150,6 @@ dataset_args={
 }
 
 class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_benchmarks(self):
         from evalscope.config import TaskConfig
 
@@ -180,19 +179,60 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    def test_vlm_benchmark(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-vl-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'mmmu',
+                # 'math_vista',
+            ],
+            dataset_args={
+                'mmmu': {
+                    'subset_list': ['Accounting']
+                },
+                'math_vista': {
+                    'subset_list': ['default']
+                }
+            },
+            eval_batch_size=1,
+            limit=1,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+                'image_height': 512,
+                'image_width': 512,
+                'image_num': 2,
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            }
+        )
+
+        run_task(task_cfg=task_cfg)
 
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_ci_lite(self):
         from evalscope.config import TaskConfig
 
+        api_key = env.get('DASHSCOPE_API_KEY')
+
         task_cfg = TaskConfig(
             model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key= env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
+            api_key=api_key,
+            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
             datasets=[
                 'general_mcq',
-                'general_qa',
                 'iquiz',
             ],
             dataset_args={
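
The test_ci_lite hunk above introduces a graceful fallback: when no DASHSCOPE_API_KEY is present, the test switches from a live service evaluation to the new EvalType.MOCK_LLM mode instead of failing. A minimal standalone sketch of that pattern, assuming the same .env-based setup and imports used throughout these tests:

# Sketch of the API-key fallback used in test_ci_lite (assumes a .env file with DASHSCOPE_API_KEY).
from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

env = dotenv_values('.env')
api_key = env.get('DASHSCOPE_API_KEY')

task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=api_key,
    # Fall back to the mock model when no key is available, so the pipeline is still exercised.
    eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
    datasets=['iquiz'],
    limit=5,
)
run_task(task_cfg=task_cfg)
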
tests/cli/test_collection.py CHANGED
@@ -52,16 +52,19 @@ class TestCollection(unittest.TestCase):
             api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
-            dataset_args={'data_collection': {
-                'local_path': 'outputs/mixed_data_test.jsonl'
-                # 'local_path': 'outputs/weighted_mixed_data.jsonl'
-            }},
+            dataset_args={
+                'data_collection': {
+                    # 'local_path': 'outputs/test_mix.jsonl'
+                    'local_path': 'outputs/mixed_data_test.jsonl',
+                    'shuffle': True,
+                }
+            },
             eval_batch_size=5,
             generation_config = {
                 'max_tokens': 10000,
                 'temperature': 0.0,
             },
-            limit=50,
+            limit=10,
             # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)
tests/cli/test_reasoning.py ADDED
@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestReasoning(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen3-0.6B',
+            'api_url': 'http://0.0.0.0:8801/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True,
+                'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply config overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()
tests/common.py ADDED
@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestBenchmark(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if not env.get('DASHSCOPE_API_KEY'):
+            use_mock = True
+            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # Apply config overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
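
The new tests/common.py base class centralizes the model, judge, and generation configuration so concrete suites only say which dataset to run, as tests/cli/test_reasoning.py above does. A minimal sketch of a suite built on it; the import path assumes the tests/ directory is on sys.path:

# Hypothetical subclass of the shared TestBenchmark helper from tests/common.py.
import unittest

from common import TestBenchmark  # assumption: tests/ is on sys.path


class TestGSM8K(TestBenchmark):

    def test_gsm8k(self):
        # Runs gsm8k through the shared config; falls back to MOCK_LLM when no API key is set.
        self._run_dataset_test('gsm8k', limit=5)

    def test_gsm8k_load_only(self):
        # Only exercises dataset loading via the mock model.
        self._run_dataset_load_test('gsm8k')


if __name__ == '__main__':
    unittest.main()
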
tests/perf/test_perf.py CHANGED
@@ -1,9 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from dotenv import dotenv_values
 
 env = dotenv_values('.env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest
 
 from evalscope.perf.main import run_perf_benchmark
@@ -123,6 +121,10 @@ class TestPerf(unittest.TestCase):
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_multi_parallel(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],
tests/rag/test_clip_benchmark.py CHANGED
@@ -1,8 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 import os
-
-# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import subprocess
 import unittest
 
evalscope/api/mixin/dataset_mixin.py DELETED
@@ -1,105 +0,0 @@
-from abc import ABC
-from collections import defaultdict
-from typing import Any, Callable, Dict
-
-from evalscope.api.dataset import Dataset, DatasetDict, RemoteDataLoader
-
-
-class DatasetLoaderMixin:
-    """
-    Mixin class providing dataset loading functionality for benchmarks.
-
-    This mixin provides common dataset loading methods that can be shared
-    across different data adapters, including support for:
-    - Loading multiple subsets
-    - Few-shot dataset loading
-    - Remote dataset loading with configuration
-    """
-
-    def load_subsets(self, load_func: Callable[[str], Dataset]) -> DatasetDict:
-        """
-        Load multiple subsets of the dataset using the provided loading function.
-
-        This method handles two loading strategies:
-        1. Reformat mode: Load only the default subset and reformat it
-        2. Multi-subset mode: Load all subsets specified in subset_list
-
-        Args:
-            load_func (Callable[[str], Dataset]): Function to load individual subsets
-
-        Returns:
-            DatasetDict: Dictionary containing all loaded subsets
-        """
-        if self.reformat_subset:
-            # Load only the default subset
-            subset_data = load_func(self.default_subset)
-            # Reformat the subset to create multiple subsets based on sample keys
-            # NOTE: subset_list and limit is applied here if specified
-            dataset_dict = DatasetDict.from_dataset(dataset=subset_data, subset_list=self.subset_list, limit=self.limit)
-        else:
-            # Load all specified subsets into separate entries
-            subset_dict = defaultdict()
-            for subset in self.subset_list:
-                subset_data = load_func(subset)
-                subset_dict[subset] = subset_data
-            dataset_dict = DatasetDict(subset_dict)
-        return dataset_dict
-
-    def load_subset(self, subset: str) -> Dataset:
-        """
-        Load a specific subset of the dataset for evaluation.
-
-        This method configures and executes the data loading for a single subset,
-        handling both split-as-subset and traditional subset configurations.
-
-        Args:
-            subset (str): The subset identifier to load
-
-        Returns:
-            Dataset: The loaded dataset subset with processed samples
-        """
-        # Determine the split and subset names based on configuration
-        split = subset if self.split_as_subset else self.eval_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create and configure the remote data loader
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,  # Custom sample conversion function
-            limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
-            repeats=self._task_config.repeats,  # Number of repetitions for each sample
-            data_source=self._task_config.dataset_hub,  # Data source configuration
-        )
-        return loader.load()
-
-    def load_fewshot_subset(self, subset: str) -> Dataset:
-        """
-        Load a subset specifically for few-shot examples.
-
-        This method loads training data to be used as demonstrations in few-shot prompting.
-        It typically loads from the training split with limited samples and optional shuffling.
-
-        Args:
-            subset (str): The subset identifier to load few-shot examples from
-
-        Returns:
-            Dataset: The loaded few-shot dataset with demonstration examples
-        """
-        # Use training split for few-shot examples
-        split = subset if self.split_as_subset else self.train_split
-        subset_name = self.default_subset if self.split_as_subset else subset
-
-        # Create loader specifically configured for few-shot sampling
-        loader = RemoteDataLoader(
-            data_id_or_path=self.dataset_id,
-            split=split,
-            subset=subset_name,
-            sample_fields=self.record_to_sample,
-            limit=self.few_shot_num
-            if not self.reformat_subset else None,  # Limit to specified number of few-shot examples
-            shuffle=self.few_shot_random,  # Randomize selection if enabled
-            data_source=self._task_config.dataset_hub,
-        )
-        return loader.load()
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py DELETED
@@ -1,44 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class GeneralI2IAdapter:
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-        dataset_name_or_path = dataset_name_or_path or self.dataset_id
-        subset_list = subset_list or self.subset_list
-
-        data_file_dict = defaultdict(str)
-        data_item_dict = defaultdict(list)
-
-        # get data file path and subset name
-        if os.path.isdir(dataset_name_or_path):
-            for subset_name in subset_list:
-                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-            data_file_dict[cur_subset_name] = dataset_name_or_path
-        else:
-            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-        # load data from local disk
-        try:
-            for subset_name, file_path in data_file_dict.items():
-                data_item_dict[subset_name] = jsonl_to_list(file_path)
-        except Exception as e:
-            raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-        return data_dict
tests/aigc/__init__.py DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.