evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
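The most visible additions in this release are four new benchmark adapters (bfcl_v3, docmath, frames, needle_haystack) alongside the new evalscope/app package. The updated CLI tests below exercise the new benchmarks; the following is a minimal sketch of how they are selected, not a verbatim excerpt: dataset and subset names mirror tests/cli/test_all.py below, while the model name, endpoint, and API key are placeholders, and the EvalType import path is assumed.

# Minimal sketch, not a verbatim excerpt from the package: names mirror the
# updated tests/cli/test_all.py; the model, API URL and key are placeholders.
import os
from evalscope.config import TaskConfig
from evalscope.constants import EvalType   # assumed import path for EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen-plus',                      # any OpenAI-compatible service model
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    # benchmark adapters newly registered in 0.16.2
    datasets=['frames', 'docmath', 'needle_haystack', 'bfcl_v3'],
    dataset_args={
        'frames': {'subset_list': ['simpshort_testmini']},
        'bfcl_v3': {'subset_list': ['simple', 'multiple']},
    },
    limit=5,
)
run_task(task_cfg=task_cfg)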
tests/aigc/test_t2i.py CHANGED
@@ -11,7 +11,7 @@ from evalscope.run import run_task
  from evalscope.utils import test_level_list
  from evalscope.utils.logger import get_logger

- os.environ['LOG_LEVEL'] = 'DEBUG'
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

  logger = get_logger()

@@ -28,15 +28,15 @@ class TestRun(unittest.TestCase):
  dataset_args={
  'general_t2i': {
  'metric_list': [
- 'PickScore',
+ # 'PickScore',
  'CLIPScore',
- 'HPSv2Score',
- 'HPSv2.1Score',
- 'BLIPv2Score',
- 'ImageRewardScore',
- 'VQAScore',
- 'FGA_BLIP2Score',
- 'MPS'
+ # 'HPSv2Score',
+ # 'HPSv2.1Score',
+ # 'BLIPv2Score',
+ # 'ImageRewardScore',
+ # 'VQAScore',
+ # 'FGA_BLIP2Score',
+ # 'MPS'
  ],
  'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
  }
@@ -58,9 +58,9 @@ class TestRun(unittest.TestCase):
  'torch_dtype': 'torch.float16',
  },
  datasets=[
- 'tifa160',
+ # 'tifa160',
  # 'genai_bench',
- # 'evalmuse',
+ 'evalmuse',
  # 'hpdv2',
  ],
  dataset_args={
@@ -85,3 +85,40 @@ class TestRun(unittest.TestCase):
  )

  run_task(task_cfg=task_cfg)
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_benchmark_flux(self):
+
+ task_cfg = TaskConfig(
+ model='black-forest-labs/FLUX.1-dev', # model on modelscope
+ model_task=ModelTask.IMAGE_GENERATION, # must be IMAGE_GENERATION
+ model_args={
+ 'torch_dtype': 'torch.float16',
+ },
+ datasets=[
+ # 'tifa160',
+ # 'genai_bench',
+ 'evalmuse',
+ # 'hpdv2',
+ ],
+ dataset_args={
+ 'tifa160': {
+ 'metric_list': [
+ 'PickScore',
+ # 'CLIPScore',
+ # 'HPSv2Score',
+ # 'BLIPv2Score',
+ # 'ImageRewardScore',
+ # 'VQAScore',
+ # 'FGA_BLIP2Score',
+ ]
+ }
+ },
+ generation_config={
+ 'num_inference_steps': 50,
+ 'guidance_scale': 3.5
+ },
+ use_cache='outputs/20250520_112314'
+ )
+
+ run_task(task_cfg=task_cfg)
tests/cli/test_all.py CHANGED
@@ -12,7 +12,7 @@ from evalscope.run import run_task
  from evalscope.utils import test_level_list
  from evalscope.utils.logger import get_logger

- os.environ['LOG_LEVEL'] = 'DEBUG'
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

  logger = get_logger()

@@ -49,6 +49,10 @@ datasets=[
  'drop',
  'winogrande',
  'tool_bench',
+ 'frames',
+ 'docmath',
+ 'needle_haystack',
+ 'bfcl_v3',
  ]

  dataset_args={
@@ -123,6 +127,12 @@ dataset_args={
  'mmlu_redux':{
  'subset_list': ['abstract_algebra']
  },
+ 'frames':{
+ 'subset_list': ['simpshort_testmini']
+ },
+ 'bfcl_v3':{
+ 'subset_list': ['simple', 'multiple']
+ }
  }

  class TestRun(unittest.TestCase):
@@ -131,7 +141,7 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen2.5-0.5b-instruct',
+ model='qwen-plus',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
@@ -145,9 +155,10 @@ class TestRun(unittest.TestCase):
  'n': 1,
  'max_tokens': 4096,
  },
+ judge_worker_num=5,
  judge_strategy=JudgeStrategy.AUTO,
  judge_model_args={
- 'model_id': 'qwen2.5-7b-instruct',
+ 'model_id': 'qwen2.5-72b-instruct',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': env.get('DASHSCOPE_API_KEY'),
  }
tests/cli/test_collection.py CHANGED
@@ -72,14 +72,16 @@ class TestCollection(unittest.TestCase):
  'local_path': 'outputs/mixed_data_test.jsonl'
  # 'local_path': 'outputs/weighted_mixed_data.jsonl'
  }},
- limit=10,
- judge_strategy=JudgeStrategy.LLM_RECALL,
+ limit=5,
+ judge_strategy=JudgeStrategy.AUTO,
  judge_model_args={
- 'model_id': 'qwen2.5-7b-instruct',
+ 'model_id': 'qwen2.5-72b-instruct',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': os.getenv('DASHSCOPE_API_KEY'),
  },
- use_cache='outputs/20250519_114427'
+ analysis_report=True,
+ ignore_errors=True,
+ # use_cache='outputs/20250522_204520'
  )
  res = run_task(task_cfg=task_cfg)
  print(res)
tests/cli/test_run.py CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
  from evalscope.utils import is_module_installed, test_level_list
  from evalscope.utils.logger import get_logger

- os.environ['LOG_LEVEL'] = 'DEBUG'
+ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

  logger = get_logger()

@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
  f'--model {model} ' \
  f'--datasets {datasets} ' \
  f'--limit {limit} ' \
- f'--generation-config do_sample=false,temperature=0.0 ' \
+ f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
  f"""--dataset-args \'{dataset_args}\' """

  logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen/Qwen2-0.5B-Instruct',
- datasets=['general_mcq', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
+ model='Qwen/Qwen3-0.6B',
+ datasets=[
+ 'general_mcq',
+ 'general_qa'
+ ],
  dataset_args={
  'general_mcq': {
  'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
  task_cfg = TaskConfig(
  model='Qwen/Qwen3-1.7B',
  datasets=[
- 'iquiz',
+ # 'iquiz',
  # 'math_500',
  # 'aime24',
  # 'competition_math',
  # 'mmlu',
  # 'simple_qa',
+ 'truthful_qa',
  ],
- model_args={
- 'device_map': 'auto',
- },
  dataset_args={
  'competition_math': {
  'subset_list': ['Level 4', 'Level 5']
@@ -304,14 +305,16 @@ class TestRun(unittest.TestCase):
  # 'arc',
  # 'ceval',
  # 'hellaswag',
- # 'general_mcq',
+ 'general_mcq',
  # 'general_qa',
  # 'super_gpqa',
  # 'mmlu_redux',
  # 'maritime_bench',
  # 'drop',
  # 'winogrande',
- 'tool_bench',
+ # 'tool_bench',
+ # 'frames',
+ # 'bfcl_v3',
  ],
  dataset_args={
  'mmlu': {
@@ -369,24 +372,31 @@ class TestRun(unittest.TestCase):
  'metric_list': ['AverageRouge']
  },
  'super_gpqa': {
- # 'subset_list': ['Philosophy', 'Education'],
+ 'subset_list': ['Philosophy', 'Education'],
  'few_shot_num': 0
  },
  'mmlu_redux':{
  'subset_list': ['abstract_algebra']
  },
+ 'bfcl_v3': {
+ 'subset_list': ['parallel'],
+ 'extra_params': {
+ # 'is_fc_model': False,
+ }
+ },
  },
- eval_batch_size=32,
- limit=10,
+ eval_batch_size=10,
+ limit=5,
  debug=True,
- stream=False,
+ stream=True,
  generation_config={
  'temperature': 0,
  'n': 1,
  'max_tokens': 4096,
+ # 'extra_headers':{'key': 'value'},
  },
- # ignore_errors=True,
- use_cache='outputs/20250519_142106'
+ ignore_errors=False,
+ # use_cache='outputs/20250616_153756'
  )

  run_task(task_cfg=task_cfg)
@@ -427,26 +437,36 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen2.5-0.5b-instruct',
+ model='qwen-plus',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
  datasets=[
- # 'math_500',
+ 'math_500',
  # 'aime24',
  # 'competition_math',
  # 'arc',
- # 'gsm8k'
+ # 'gsm8k',
  # 'truthful_qa',
  # 'simple_qa',
- 'chinese_simpleqa',
+ # 'chinese_simpleqa',
  # 'live_code_bench',
  # 'humaneval',
  # 'general_qa',
  # 'alpaca_eval',
- # 'arena_hard'
+ # 'arena_hard',
+ # 'frames',
+ # 'docmath',
+ # 'needle_haystack',
+ # 'ifeval',
  ],
  dataset_args={
+ 'needle_haystack': {
+ 'subset_list': ['english'],
+ 'extra_params': {
+ 'show_score': True,
+ }
+ },
  'competition_math': {
  'subset_list': ['Level 4']
  },
@@ -469,13 +489,16 @@ class TestRun(unittest.TestCase):
  '中华文化'
  ]
  },
+ 'frames': {
+ 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+ }
  },
  eval_batch_size=10,
- limit=10,
- judge_strategy=JudgeStrategy.AUTO,
+ limit=3,
+ judge_strategy=JudgeStrategy.LLM,
  judge_worker_num=5,
  judge_model_args={
- 'model_id': 'qwen2.5-7b-instruct',
+ 'model_id': 'qwen2.5-72b-instruct',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': env.get('DASHSCOPE_API_KEY'),
  'generation_config': {
@@ -491,7 +514,9 @@ class TestRun(unittest.TestCase):
  },
  },
  timeout=60000,
  stream=True,
- use_cache='outputs/20250519_142551'
+ # analysis_report=True,
+ # debug=True,
+ # use_cache='outputs/20250616_161931'
  )
  run_task(task_cfg=task_cfg)
tests/rag/test_clip_benchmark.py CHANGED
@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
  'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
  }
  ],
- 'dataset_name': ['muge', 'mnist'],
+ 'dataset_name': [
+ 'muge',
+ 'mnist',
+ 'flickr8k'
+ ],
  'split': 'test',
  'batch_size': 128,
  'num_workers': 1,
tests/rag/test_mteb.py CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
  ],
  'eval': {
  'tasks': [
- # 'TNews',
- # 'CLSClusteringS2S',
+ 'TNews',
+ 'CLSClusteringS2S',
  'T2Reranking',
- # 'T2Retrieval',
- # 'ATEC',
+ 'T2Retrieval',
+ 'ATEC',
  ],
  'verbosity': 2,
  'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
  ],
  'verbosity': 2,
  'overwrite_results': True,
- 'limits': 30,
+ 'limits': 10,
  },
  },
  )
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
  },
  ],
  'eval': {
- 'tasks': ['MedicalRetrieval', 'T2Retrieval'],
+ 'tasks': [
+ 'MedicalRetrieval',
+ 'T2Retrieval'
+ ],
+ 'verbosity': 2,
+ 'overwrite_results': True,
+ 'limits': 10,
+ 'top_k': 10,
+ },
+ },
+ }
+
+ run_task(task_cfg)
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_two_stage_api(self):
+ task_cfg = {
+ 'eval_backend': 'RAGEval',
+ 'eval_config': {
+ 'tool': 'MTEB',
+ 'model': [
+ {
+ 'model_name': 'text-embedding-v3',
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+ 'dimensions': 1024,
+ 'encode_kwargs': {
+ 'batch_size': 10,
+ },
+ },
+ {
+ 'model_name': 'text-embedding-v3',
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+ 'dimensions': 1024,
+ 'encode_kwargs': {
+ 'batch_size': 10,
+ },
+ },
+ ],
+ 'eval': {
+ 'tasks': [
+ 'MedicalRetrieval',
+ # 'T2Retrieval'
+ ],
  'verbosity': 2,
  'overwrite_results': True,
- # 'limits': 10,
+ 'limits': 10,
  'top_k': 10,
  },
  },