evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0 (new BFCL benchmark adapter; see the sketch after this list)
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
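The largest functional addition in this release is the BFCL (Berkeley Function Calling Leaderboard) v3 benchmark, split across a new benchmark adapter (evalscope/benchmarks/bfcl/bfcl_adapter.py) and a matching model adapter (evalscope/models/adapters/bfcl_adapter.py). Below is a minimal sketch of enabling it, mirroring the configuration exercised in the updated tests/cli/test_run.py further down this diff; the model name is a placeholder and the run_task import path is assumed from evalscope's usual layout, so treat it as illustrative rather than authoritative.

from evalscope.config import TaskConfig  # import path shown in the test diff below
from evalscope.run import run_task       # assumed import path for run_task

# Minimal sketch: run the new bfcl_v3 benchmark on a local model.
task_cfg = TaskConfig(
    model='Qwen/Qwen3-1.7B',              # placeholder model, as used in the updated tests
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            'subset_list': ['parallel'],  # subset selected in the updated test
            'extra_params': {
                # 'is_fc_model': False,   # toggle shown (commented out) in the test
            },
        },
    },
    eval_batch_size=10,
    limit=5,
)

run_task(task_cfg=task_cfg)
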
tests/cli/test_run.py CHANGED
@@ -63,7 +63,7 @@ class TestRun(unittest.TestCase):
             f'--model {model} ' \
             f'--datasets {datasets} ' \
             f'--limit {limit} ' \
-            f'--generation-config do_sample=false,temperature=0.0 ' \
+            f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
             f"""--dataset-args \'{dataset_args}\' """

         logger.info(f'Start to run command: {cmd_with_args}')
@@ -187,8 +187,11 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen/Qwen2-0.5B-Instruct',
-            datasets=['general_mcq', 'general_qa'],  # data format; the MCQ format is fixed to 'ceval'
+            model='Qwen/Qwen3-0.6B',
+            datasets=[
+                'general_mcq',
+                'general_qa'
+            ],
             dataset_args={
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
@@ -215,16 +218,14 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen3-1.7B',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
                 # 'competition_math',
                 # 'mmlu',
                 # 'simple_qa',
+                'truthful_qa',
             ],
-            model_args={
-                'device_map': 'auto',
-            },
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
@@ -304,7 +305,7 @@ class TestRun(unittest.TestCase):
                 # 'arc',
                 # 'ceval',
                 # 'hellaswag',
-                # 'general_mcq',
+                'general_mcq',
                 # 'general_qa',
                 # 'super_gpqa',
                 # 'mmlu_redux',
@@ -312,7 +313,8 @@ class TestRun(unittest.TestCase):
                 # 'drop',
                 # 'winogrande',
                 # 'tool_bench',
-                'frames',
+                # 'frames',
+                # 'bfcl_v3',
             ],
             dataset_args={
                 'mmlu': {
@@ -370,25 +372,31 @@ class TestRun(unittest.TestCase):
                     'metric_list': ['AverageRouge']
                 },
                 'super_gpqa': {
-                    # 'subset_list': ['Philosophy', 'Education'],
+                    'subset_list': ['Philosophy', 'Education'],
                     'few_shot_num': 0
                 },
                 'mmlu_redux':{
                     'subset_list': ['abstract_algebra']
                 },
+                'bfcl_v3': {
+                    'subset_list': ['parallel'],
+                    'extra_params': {
+                        # 'is_fc_model': False,
+                    }
+                },
             },
-            eval_batch_size=32,
-            limit=10,
+            eval_batch_size=10,
+            limit=5,
             debug=True,
-            stream=False,
+            stream=True,
             generation_config={
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
                 # 'extra_headers':{'key': 'value'},
            },
-            # ignore_errors=True,
-            # use_cache='outputs/20250519_142106'
+            ignore_errors=False,
+            # use_cache='outputs/20250616_153756'
        )

        run_task(task_cfg=task_cfg)
@@ -434,8 +442,8 @@ class TestRun(unittest.TestCase):
            api_key= env.get('DASHSCOPE_API_KEY'),
            eval_type=EvalType.SERVICE,
            datasets=[
-                # 'math_500',
-                'aime24',
+                'math_500',
+                # 'aime24',
                # 'competition_math',
                # 'arc',
                # 'gsm8k',
@@ -450,8 +458,15 @@ class TestRun(unittest.TestCase):
                # 'frames',
                # 'docmath',
                # 'needle_haystack',
+                # 'ifeval',
            ],
            dataset_args={
+                'needle_haystack': {
+                    'subset_list': ['english'],
+                    'extra_params': {
+                        'show_score': True,
+                    }
+                },
                'competition_math': {
                    'subset_list': ['Level 4']
                },
@@ -479,8 +494,8 @@ class TestRun(unittest.TestCase):
                }
            },
            eval_batch_size=10,
-            limit=1,
-            judge_strategy=JudgeStrategy.AUTO,
+            limit=3,
+            judge_strategy=JudgeStrategy.LLM,
            judge_worker_num=5,
            judge_model_args={
                'model_id': 'qwen2.5-72b-instruct',
@@ -499,9 +514,9 @@ class TestRun(unittest.TestCase):
            },
            timeout=60000,
            stream=True,
-            analysis_report=True,
+            # analysis_report=True,
            # debug=True,
-            # use_cache='outputs/20250602_135859'
+            # use_cache='outputs/20250616_161931'
        )

        run_task(task_cfg=task_cfg)
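Two other changes in this file are worth noting: the CLI test now passes a sampling-enabled --generation-config, and the service-mode test adds a 'needle_haystack' entry with the new 'show_score' extra parameter while switching the judge to JudgeStrategy.LLM. A minimal service-mode sketch built from keys visible in the hunks above follows; the served model name, the API endpoint, the constants import path, and the os.environ lookup are assumptions, and judge_model_args is omitted for brevity.

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy  # assumed import path
from evalscope.run import run_task                       # assumed import path

# Sketch: service-mode evaluation of needle_haystack with per-needle score output.
task_cfg = TaskConfig(
    model='qwen-plus',                                   # placeholder served model, not from this diff
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',  # assumed endpoint
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['needle_haystack'],
    dataset_args={
        'needle_haystack': {
            'subset_list': ['english'],
            'extra_params': {
                'show_score': True,                      # new toggle exercised in the test above
            },
        },
    },
    eval_batch_size=10,
    limit=3,
    judge_strategy=JudgeStrategy.LLM,
    judge_worker_num=5,
    stream=True,
    timeout=60000,
)

run_task(task_cfg=task_cfg)
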
tests/rag/test_clip_benchmark.py CHANGED

@@ -39,7 +39,11 @@ class TestCLIPBenchmark(unittest.TestCase):
                    'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                }
            ],
-            'dataset_name': ['muge', 'mnist'],
+            'dataset_name': [
+                'muge',
+                'mnist',
+                'flickr8k'
+            ],
            'split': 'test',
            'batch_size': 128,
            'num_workers': 1,
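For context, the 'dataset_name' list above lives inside the RAGEval CLIP-benchmark task config. A hedged reconstruction of the surrounding config follows; only the keys visible in the hunk come from this diff, while the 'eval_backend', 'tool', 'eval', and 'models' wrapper keys and the run_task import path are assumptions based on evalscope's documented clip_benchmark layout.

from evalscope.run import run_task  # assumed import path

# Sketch: CLIP benchmark run including the newly added 'flickr8k' dataset.
task_cfg = {
    'eval_backend': 'RAGEval',            # assumed wrapper, not visible in the hunk
    'eval_config': {
        'tool': 'clip_benchmark',         # assumed wrapper, not visible in the hunk
        'eval': {
            'models': [                   # assumed key wrapping the model entry shown above
                {
                    'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
                },
            ],
            'dataset_name': ['muge', 'mnist', 'flickr8k'],  # 'flickr8k' is the addition in 0.16.2
            'split': 'test',
            'batch_size': 128,
            'num_workers': 1,
        },
    },
}

run_task(task_cfg)
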
tests/rag/test_mteb.py CHANGED
@@ -121,10 +121,54 @@ class TestMTEB(unittest.TestCase):
                    },
                ],
                'eval': {
-                    'tasks': ['MedicalRetrieval', 'T2Retrieval'],
+                    'tasks': [
+                        'MedicalRetrieval',
+                        'T2Retrieval'
+                    ],
                    'verbosity': 2,
                    'overwrite_results': True,
-                    # 'limits': 10,
+                    'limits': 10,
+                    'top_k': 10,
+                },
+            },
+        }
+
+        run_task(task_cfg)
+
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_two_stage_api(self):
+        task_cfg = {
+            'eval_backend': 'RAGEval',
+            'eval_config': {
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    },
+                ],
+                'eval': {
+                    'tasks': [
+                        'MedicalRetrieval',
+                        # 'T2Retrieval'
+                    ],
+                    'verbosity': 2,
+                    'overwrite_results': True,
+                    'limits': 10,
                    'top_k': 10,
                },
            },
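The new test_run_two_stage_api case drives MTEB retrieval entirely through an OpenAI-compatible embedding endpoint, listing two model entries, presumably one per stage. A single-model variant of that config is sketched below; every key comes from the hunk above, with only the os.environ lookup (in place of the test's env.get helper) and the run_task import path assumed.

import os

from evalscope.run import run_task  # assumed import path

# Sketch: API-based MTEB retrieval evaluation with a single embedding model.
task_cfg = {
    'eval_backend': 'RAGEval',
    'eval_config': {
        'tool': 'MTEB',
        'model': [
            {
                'model_name': 'text-embedding-v3',
                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                'api_key': os.environ.get('DASHSCOPE_API_KEY', 'EMPTY'),
                'dimensions': 1024,
                'encode_kwargs': {'batch_size': 10},
            },
        ],
        'eval': {
            'tasks': ['MedicalRetrieval'],
            'verbosity': 2,
            'overwrite_results': True,
            'limits': 10,
            'top_k': 10,
        },
    },
}

run_task(task_cfg)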