evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
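
The listing above also shows a batch of new benchmark adapters (amc, blink, chartqa, docvqa, healthbench, infovqa, minerva_math, mm_bench, mm_star, multi_if, ocr_bench, ocr_bench_v2, olympiad_bench, omni_bench, real_world_qa). A minimal sketch of smoke-testing one of them with the mock backend, assuming the adapter registers under its directory name (the dataset id 'amc' is an unverified assumption; the model name is a placeholder):

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task

    # Dataset-loading smoke test: MOCK_LLM avoids calling a real model.
    # 'amc' is assumed to be the registered name of the new adapter.
    task_cfg = TaskConfig(
        model='mock-model',          # placeholder; not used by the mock backend
        eval_type=EvalType.MOCK_LLM,
        datasets=['amc'],
        limit=5,                     # only load/check a handful of samples
    )
    run_task(task_cfg=task_cfg)
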
tests/cli/test_reasoning.py DELETED
@@ -1,81 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import unittest
- from unittest import TestCase
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestReasoning(TestCase):
-     """Benchmark evaluation test cases."""
-
-     def setUp(self):
-         """Setup common test configuration."""
-         self.base_config = {
-             'model': 'Qwen3-0.6B',
-             'api_url': 'http://0.0.0.0:8801/v1',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'eval_type': EvalType.SERVICE,
-             'eval_batch_size': 5,
-             'limit': 5,
-             'generation_config': {
-                 'max_tokens': 4096,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'parallel_tool_calls': True,
-                 'extra_body': {'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
-             },
-             'judge_strategy': JudgeStrategy.AUTO,
-             'judge_worker_num': 5,
-             'judge_model_args': {
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096,
-                 }
-             },
-             'debug': True,
-         }
-
-     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-         """Helper method to run test for a specific dataset."""
-         config = self.base_config.copy()
-         config['datasets'] = [dataset_name]
-
-         if use_mock:
-             config['eval_type'] = EvalType.MOCK_LLM
-
-         # Apply configuration overrides
-         config.update(config_overrides)
-
-         if dataset_args:
-             config['dataset_args'] = {dataset_name: dataset_args}
-
-         task_cfg = TaskConfig(**config)
-         run_task(task_cfg=task_cfg)
-
-     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-         """Helper method to test dataset loading."""
-
-         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
-
-     # Math & Reasoning datasets
-     def test_gsm8k(self):
-         """Test GSM8K math reasoning dataset."""
-         self._run_dataset_test('gsm8k')
-
-
- if __name__ == '__main__':
-     # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
-     # Run all tests: python -m unittest test_eval.TestBenchmark
-     unittest.main()
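
For reference, a trimmed, standalone sketch of the configuration pattern this removed test exercised; the endpoint, model names, and API key are placeholders rather than values required by evalscope:

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType, JudgeStrategy
    from evalscope.run import run_task

    # Service-mode evaluation with an LLM judge; endpoints and model names are placeholders.
    task_cfg = TaskConfig(
        model='Qwen3-0.6B',
        api_url='http://127.0.0.1:8801/v1',      # OpenAI-compatible serving endpoint (placeholder)
        api_key='EMPTY',
        eval_type=EvalType.SERVICE,
        datasets=['gsm8k'],
        limit=5,                                  # cap the number of evaluated samples
        generation_config={'max_tokens': 4096, 'temperature': 0.0, 'seed': 42},
        judge_strategy=JudgeStrategy.AUTO,
        judge_model_args={
            'model_id': 'qwen2.5-72b-instruct',
            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
            'api_key': 'YOUR_API_KEY',            # placeholder
        },
    )
    run_task(task_cfg=task_cfg)
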
tests/common.py DELETED
@@ -1,73 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import unittest
- from unittest import TestCase
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestBenchmark(TestCase):
-     """Benchmark evaluation test cases."""
-
-     def setUp(self):
-         """Setup common test configuration."""
-         self.base_config = {
-             'model': 'qwen-plus',
-             'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'eval_type': EvalType.SERVICE,
-             'eval_batch_size': 5,
-             'limit': 5,
-             'generation_config': {
-                 'max_tokens': 4096,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'parallel_tool_calls': True
-             },
-             'judge_strategy': JudgeStrategy.AUTO,
-             'judge_worker_num': 5,
-             'judge_model_args': {
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096,
-                 }
-             },
-             'debug': True,
-         }
-
-     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-         """Helper method to run test for a specific dataset."""
-         config = self.base_config.copy()
-         config['datasets'] = [dataset_name]
-
-         if not env.get('DASHSCOPE_API_KEY'):
-             use_mock = True
-             logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
-
-         if use_mock:
-             config['eval_type'] = EvalType.MOCK_LLM
-
-         # Apply configuration overrides
-         config.update(config_overrides)
-
-         if dataset_args:
-             config['dataset_args'] = {dataset_name: dataset_args}
-
-         task_cfg = TaskConfig(**config)
-         run_task(task_cfg=task_cfg)
-
-     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-         """Helper method to test dataset loading."""
-
-         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
tests/perf/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
tests/perf/test_perf.py DELETED
@@ -1,178 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
- import unittest
-
- from evalscope.perf.main import run_perf_benchmark
- from tests.utils import test_level_list
-
-
- class TestPerf(unittest.TestCase):
-
-     def setUp(self) -> None:
-         pass
-
-     def tearDown(self) -> None:
-         pass
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/chat/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             # 'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_stream(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8801/v1/chat/completions',
-             'parallel': 1,
-             'model': 'Qwen2.5-0.5B-Instruct',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_speed_benchmark(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'api': 'openai',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_stream(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_speed_benchmark(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'api': 'local_vllm',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_random(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=20,
-             model='Qwen3-1.7B',
-             url='http://127.0.0.1:8801/v1/completions',
-             api='openai',
-             dataset='random',
-             min_tokens=1024,
-             max_tokens=1024,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             number=20,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_multi_parallel(self):
-         if not env.get('DASHSCOPE_API_KEY'):
-             self.skipTest('DASHSCOPE_API_KEY is not set.')
-             return
-
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen2.5-7b-instruct',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='random',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_random_vl(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen-vl-max',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='kontext_bench',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=100,
-             max_prompt_length=100,
-             image_height=512,
-             image_width=512,
-             image_num=2,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)
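
For reference, a minimal sketch of the run_perf_benchmark entry point that these removed tests exercised; the URL and model name are placeholders for a local OpenAI-compatible server:

    from evalscope.perf.main import run_perf_benchmark

    # Smallest non-streaming perf run, mirroring the simplest removed case;
    # the endpoint and served model name are placeholders.
    task_cfg = {
        'url': 'http://127.0.0.1:8001/v1/chat/completions',
        'parallel': 1,
        'model': 'qwen2.5',
        'number': 15,          # total number of requests to send
        'api': 'openai',
        'dataset': 'openqa',
        'debug': True,
    }
    run_perf_benchmark(task_cfg)
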
tests/rag/test_clip_benchmark.py DELETED
@@ -1,87 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- import subprocess
- import unittest
-
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
- from tests.utils import test_level_list
-
- logger = get_logger()
-
-
- class TestCLIPBenchmark(unittest.TestCase):
-
-     def setUp(self) -> None:
-         self._check_env('webdataset')
-
-     def tearDown(self) -> None:
-         pass
-
-     @staticmethod
-     def _check_env(module_name: str):
-         if is_module_installed(module_name):
-             logger.info(f'{module_name} is installed.')
-         else:
-             raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'clip_benchmark',
-                 'eval': {
-                     'models': [
-                         {
-                             'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                         }
-                     ],
-                     'dataset_name': [
-                         'muge',
-                         'mnist',
-                         'flickr8k'
-                     ],
-                     'split': 'test',
-                     'batch_size': 128,
-                     'num_workers': 1,
-                     'verbose': True,
-                     'skip_existing': False,
-                     'cache_dir': 'cache',
-                     'limit': 1000,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'clip_benchmark',
-                 'eval': {
-                     'models': [
-                         {
-                             'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                         }
-                     ],
-                     'dataset_name': ['custom'],
-                     'data_dir': 'custom_eval/multimodal/text-image-retrieval',
-                     'split': 'test',
-                     'batch_size': 128,
-                     'num_workers': 1,
-                     'verbose': True,
-                     'skip_existing': False,
-                     'limit': 1000,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)
tests/rag/test_mteb.py DELETED
@@ -1,213 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import unittest
- from dotenv import dotenv_values
-
- from tests.utils import test_level_list
-
- env = dotenv_values('.env')
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestMTEB(unittest.TestCase):
-
-     def setUp(self) -> None:
-         self._check_env('mteb')
-
-     def tearDown(self) -> None:
-         pass
-
-     @staticmethod
-     def _check_env(module_name: str):
-         if is_module_installed(module_name):
-             logger.info(f'{module_name} is installed.')
-         else:
-             raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_one_stage_mteb(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'MTEB',
-                 'model': [
-                     {
-                         'model_name_or_path': 'AI-ModelScope/m3e-base',
-                         'pooling_mode': None,  # load from model config
-                         'max_seq_length': 512,
-                         'prompt': '',
-                         'model_kwargs': {'torch_dtype': 'auto'},
-                         'encode_kwargs': {
-                             'batch_size': 128,
-                         },
-                     }
-                 ],
-                 'eval': {
-                     'tasks': [
-                         'TNews',
-                         'CLSClusteringS2S',
-                         'T2Reranking',
-                         'T2Retrieval',
-                         'ATEC',
-                     ],
-                     'verbosity': 2,
-                     'overwrite_results': True,
-                     'limits': 500,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_one_stage_api(self):
-         from evalscope import TaskConfig
-         task_cfg = TaskConfig(
-             eval_backend='RAGEval',
-             eval_config={
-                 'tool': 'MTEB',
-                 'model': [
-                     {
-                         'model_name': 'text-embedding-v3',
-                         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                         'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                         'dimensions': 1024,
-                         'encode_kwargs': {
-                             'batch_size': 10,
-                         },
-                     }
-                 ],
-                 'eval': {
-                     'tasks': [
-                         'T2Retrieval',
-                     ],
-                     'verbosity': 2,
-                     'overwrite_results': True,
-                     'limits': 10,
-                 },
-             },
-         )
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_two_stage_mteb(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'MTEB',
-                 'model': [
-                     {
-                         'model_name_or_path': 'AI-ModelScope/m3e-base',
-                         'is_cross_encoder': False,
-                         'max_seq_length': 512,
-                         'prompt': '',
-                         'model_kwargs': {'torch_dtype': 'auto'},
-                         'encode_kwargs': {
-                             'batch_size': 64,
-                         },
-                     },
-                     {
-                         'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
-                         'is_cross_encoder': True,
-                         'max_seq_length': 512,
-                         'prompt': '为这个问题生成一个检索用的表示',
-                         'model_kwargs': {'torch_dtype': 'auto'},
-                         'encode_kwargs': {
-                             'batch_size': 32,
-                         },
-                     },
-                 ],
-                 'eval': {
-                     'tasks': [
-                         'MedicalRetrieval',
-                         'T2Retrieval'
-                     ],
-                     'verbosity': 2,
-                     'overwrite_results': True,
-                     'limits': 10,
-                     'top_k': 10,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_two_stage_api(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'MTEB',
-                 'model': [
-                     {
-                         'model_name': 'text-embedding-v3',
-                         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                         'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                         'dimensions': 1024,
-                         'encode_kwargs': {
-                             'batch_size': 10,
-                         },
-                     },
-                     {
-                         'model_name': 'text-embedding-v3',
-                         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                         'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
-                         'dimensions': 1024,
-                         'encode_kwargs': {
-                             'batch_size': 10,
-                         },
-                     },
-                 ],
-                 'eval': {
-                     'tasks': [
-                         'MedicalRetrieval',
-                         # 'T2Retrieval'
-                     ],
-                     'verbosity': 2,
-                     'overwrite_results': True,
-                     'limits': 10,
-                     'top_k': 10,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'MTEB',
-                 'model': [
-                     {
-                         'model_name_or_path': 'AI-ModelScope/m3e-base',
-                         'pooling_mode': None,  # load from model config
-                         'max_seq_length': 512,
-                         'prompt': '',
-                         'model_kwargs': {'torch_dtype': 'auto'},
-                         'encode_kwargs': {
-                             'batch_size': 128,
-                         },
-                     }
-                 ],
-                 'eval': {
-                     'tasks': ['CustomRetrieval'],
-                     'dataset_path': 'custom_eval/text/retrieval',
-                     'verbosity': 2,
-                     'overwrite_results': True,
-                     'limits': 500,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)
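
For reference, a condensed sketch of the RAGEval/MTEB backend configuration these removed tests exercised, trimmed to a single embedding model and a single task (the model path and limit value are taken from the removed code and are illustrative, not required defaults):

    from evalscope.run import run_task

    # One-stage embedding evaluation through the RAGEval backend; values are illustrative.
    task_cfg = {
        'eval_backend': 'RAGEval',
        'eval_config': {
            'tool': 'MTEB',
            'model': [{
                'model_name_or_path': 'AI-ModelScope/m3e-base',
                'max_seq_length': 512,
                'encode_kwargs': {'batch_size': 128},
            }],
            'eval': {
                'tasks': ['T2Retrieval'],
                'verbosity': 2,
                'overwrite_results': True,
                'limits': 500,     # cap samples per task
            },
        },
    }
    run_task(task_cfg)
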