evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
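The file list shows a new evalscope/app package (split out of evalscope/report) and new benchmark adapters (bfcl_v3, docmath, frames, needle_haystack). A minimal sketch of enabling the new benchmarks through TaskConfig, mirroring the test changes shown below; the model name, endpoint, subset choices, and limit are illustrative assumptions, not requirements of the release:

# Sketch only: mirrors tests/cli/test_all.py below; model, endpoint, and subsets are illustrative.
import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen-plus',  # any model served behind an OpenAI-compatible endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['frames', 'docmath', 'needle_haystack', 'bfcl_v3'],  # adapters added in 0.16.2
    dataset_args={
        'frames': {'subset_list': ['simpshort_testmini']},
        'bfcl_v3': {'subset_list': ['simple', 'multiple']},
    },
    limit=5,  # keep the smoke run small
)

run_task(task_cfg=task_cfg)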
tests/aigc/test_t2i.py
CHANGED
@@ -11,7 +11,7 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -28,15 +28,15 @@ class TestRun(unittest.TestCase):
             dataset_args={
                 'general_t2i': {
                     'metric_list': [
-                        'PickScore',
+                        # 'PickScore',
                         'CLIPScore',
-                        'HPSv2Score',
-                        'HPSv2.1Score',
-                        'BLIPv2Score',
-                        'ImageRewardScore',
-                        'VQAScore',
-                        'FGA_BLIP2Score',
-                        'MPS'
+                        # 'HPSv2Score',
+                        # 'HPSv2.1Score',
+                        # 'BLIPv2Score',
+                        # 'ImageRewardScore',
+                        # 'VQAScore',
+                        # 'FGA_BLIP2Score',
+                        # 'MPS'
                     ],
                     'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
                 }
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
                 'torch_dtype': 'torch.float16',
             },
             datasets=[
-                'tifa160',
+                # 'tifa160',
                 # 'genai_bench',
-
+                'evalmuse',
                 # 'hpdv2',
             ],
             dataset_args={
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
         )
 
         run_task(task_cfg=task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_benchmark_flux(self):
+
+        task_cfg = TaskConfig(
+            model='black-forest-labs/FLUX.1-dev',  # model on modelscope
+            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
+            model_args={
+                'torch_dtype': 'torch.float16',
+            },
+            datasets=[
+                # 'tifa160',
+                # 'genai_bench',
+                'evalmuse',
+                # 'hpdv2',
+            ],
+            dataset_args={
+                'tifa160': {
+                    'metric_list': [
+                        'PickScore',
+                        # 'CLIPScore',
+                        # 'HPSv2Score',
+                        # 'BLIPv2Score',
+                        # 'ImageRewardScore',
+                        # 'VQAScore',
+                        # 'FGA_BLIP2Score',
+                    ]
+                }
+            },
+            generation_config={
+                'num_inference_steps': 50,
+                'guidance_scale': 3.5
+            },
+            use_cache='outputs/20250520_112314'
+        )
+
+        run_task(task_cfg=task_cfg)
tests/cli/test_all.py
CHANGED
@@ -12,7 +12,7 @@ from evalscope.run import run_task
 from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -49,6 +49,10 @@ datasets=[
     'drop',
     'winogrande',
     'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
 ]
 
 dataset_args={
@@ -123,6 +127,12 @@ dataset_args={
     'mmlu_redux':{
         'subset_list': ['abstract_algebra']
     },
+    'frames':{
+        'subset_list': ['simpshort_testmini']
+    },
+    'bfcl_v3':{
+        'subset_list': ['simple', 'multiple']
+    }
 }
 
 class TestRun(unittest.TestCase):
@@ -131,7 +141,7 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -145,9 +155,10 @@ class TestRun(unittest.TestCase):
                 'n': 1,
                 'max_tokens': 4096,
             },
+            judge_worker_num=5,
             judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
             }
tests/cli/test_collection.py
CHANGED
@@ -72,14 +72,16 @@ class TestCollection(unittest.TestCase):
                 'local_path': 'outputs/mixed_data_test.jsonl'
                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
             }},
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=5,
+            judge_strategy=JudgeStrategy.AUTO,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
             },
-
+            analysis_report=True,
+            ignore_errors=True,
+            # use_cache='outputs/20250522_204520'
         )
         res = run_task(task_cfg=task_cfg)
         print(res)
tests/cli/test_run.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
             f'--model {model} ' \
             f'--datasets {datasets} ' \
             f'--limit {limit} ' \
-            f'--generation-config do_sample=
+            f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
             f"""--dataset-args \'{dataset_args}\' """
 
         logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
-            datasets=[
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
             dataset_args={
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
                 # 'simple_qa',
+                'truthful_qa',
             ],
-            model_args={
-                'device_map': 'auto',
-            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -304,14 +305,16 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-
+                'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
                 # 'maritime_bench',
                 # 'drop',
                 # 'winogrande',
-                'tool_bench',
+                # 'tool_bench',
+                # 'frames',
+                # 'bfcl_v3',
             ],
             dataset_args={
                 'mmlu': {
@@ -369,24 +372,31 @@ class TestRun(unittest.TestCase):
                     'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
-
+                    'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
                 },
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'bfcl_v3': {
+                    'subset_list': ['parallel'],
+                    'extra_params': {
+                        # 'is_fc_model': False,
+                    }
+                },
             },
-            eval_batch_size=
-            limit=
+            eval_batch_size=10,
+            limit=5,
             debug=True,
-            stream=
+            stream=True,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
+                # 'extra_headers':{'key': 'value'},
             },
-
-            use_cache='outputs/
+            ignore_errors=False,
+            # use_cache='outputs/20250616_153756'
         )
 
         run_task(task_cfg=task_cfg)
@@ -427,26 +437,36 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
-
+                'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
-                'chinese_simpleqa',
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                # 'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
+                # 'ifeval',
             ],
             dataset_args={
+                'needle_haystack': {
+                    'subset_list': ['english'],
+                    'extra_params': {
+                        'show_score': True,
+                    }
+                },
                 'competition_math': {
                     'subset_list': ['Level 4']
                 },
@@ -469,13 +489,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
             eval_batch_size=10,
-            limit=
-            judge_strategy=JudgeStrategy.
+            limit=3,
+            judge_strategy=JudgeStrategy.LLM,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -491,7 +514,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-
+            # analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250616_161931'
         )
 
         run_task(task_cfg=task_cfg)
tests/rag/test_clip_benchmark.py
CHANGED
@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
                     'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                 }
             ],
-            'dataset_name': [
+            'dataset_name': [
+                'muge',
+                'mnist',
+                'flickr8k'
+            ],
             'split': 'test',
             'batch_size': 128,
             'num_workers': 1,
tests/rag/test_mteb.py
CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
             ],
             'eval': {
                 'tasks': [
-
-
+                    'TNews',
+                    'CLSClusteringS2S',
                     'T2Reranking',
-
-
+                    'T2Retrieval',
+                    'ATEC',
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
-                'limits':
+                'limits': 10,
             },
         },
     )
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
                 },
             ],
             'eval': {
-                'tasks': [
+                'tasks': [
+                    'MedicalRetrieval',
+                    'T2Retrieval'
+                ],
+                'verbosity': 2,
+                'overwrite_results': True,
+                'limits': 10,
+                'top_k': 10,
+            },
+        },
+    }
+
+        run_task(task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_two_stage_api(self):
+        task_cfg = {
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                ],
+                'eval': {
+                    'tasks': [
+                        'MedicalRetrieval',
+                        # 'T2Retrieval'
+                    ],
                 'verbosity': 2,
                 'overwrite_results': True,
-
+                'limits': 10,
                 'top_k': 10,
             },
         },