evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.


Files changed (61)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +20 -25
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/benchmarks/benchmark.py +1 -0
  11. evalscope/benchmarks/data_adapter.py +97 -16
  12. evalscope/benchmarks/docmath/__init__.py +0 -0
  13. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  14. evalscope/benchmarks/docmath/utils.py +220 -0
  15. evalscope/benchmarks/frames/__init__.py +0 -0
  16. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  17. evalscope/benchmarks/frames/utils.py +37 -0
  18. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  19. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  20. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  21. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
  22. evalscope/benchmarks/tool_bench/utils.py +5 -4
  23. evalscope/benchmarks/utils.py +25 -0
  24. evalscope/cli/start_app.py +2 -2
  25. evalscope/collections/__init__.py +35 -3
  26. evalscope/collections/evaluator.py +18 -6
  27. evalscope/config.py +8 -2
  28. evalscope/evaluator/evaluator.py +38 -27
  29. evalscope/metrics/__init__.py +3 -1
  30. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  31. evalscope/metrics/llm_judge.py +12 -5
  32. evalscope/metrics/math_parser.py +1 -1
  33. evalscope/models/adapters/server_adapter.py +2 -6
  34. evalscope/perf/arguments.py +2 -2
  35. evalscope/perf/benchmark.py +0 -9
  36. evalscope/perf/main.py +7 -0
  37. evalscope/perf/plugin/datasets/custom.py +15 -0
  38. evalscope/perf/utils/benchmark_util.py +1 -1
  39. evalscope/perf/utils/local_server.py +1 -0
  40. evalscope/perf/utils/log_utils.py +12 -5
  41. evalscope/perf/utils/rich_display.py +1 -1
  42. evalscope/report/__init__.py +36 -4
  43. evalscope/report/combinator.py +8 -0
  44. evalscope/report/generator.py +33 -9
  45. evalscope/report/utils.py +60 -3
  46. evalscope/run.py +12 -0
  47. evalscope/utils/logger.py +1 -1
  48. evalscope/utils/utils.py +12 -0
  49. evalscope/version.py +2 -2
  50. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
  51. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
  52. tests/aigc/test_t2i.py +40 -3
  53. tests/cli/test_all.py +39 -35
  54. tests/cli/test_collection.py +7 -6
  55. tests/cli/test_run.py +21 -11
  56. tests/rag/test_mteb.py +5 -5
  57. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  58. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  59. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  60. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  61. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
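
The headline changes are three new benchmark adapters (docmath, frames, needle_haystack) and the extraction of the visualization app from evalscope/report into its own evalscope/app package. Based on the tests/cli/test_run.py changes shown below, a minimal service-mode run of the new frames benchmark might look like the following sketch; the model id, API endpoint, and local dataset path are illustrative values copied from the tests, not requirements:

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

# Sketch of a service-mode eval of the new 'frames' benchmark, mirroring the
# updated test below; adjust model, endpoint, and path to your own setup.
task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['frames'],
    dataset_args={
        'frames': {
            # Optional: point at a locally downloaded copy of the dataset.
            'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames',
        },
    },
    limit=1,  # smoke-test a single sample
)
run_task(task_cfg=task_cfg)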
tests/cli/test_run.py CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['LOG_LEVEL'] = 'DEBUG'
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -311,7 +311,8 @@ class TestRun(unittest.TestCase):
                 # 'maritime_bench',
                 # 'drop',
                 # 'winogrande',
-                'tool_bench',
+                # 'tool_bench',
+                'frames',
             ],
             dataset_args={
                 'mmlu': {
@@ -384,9 +385,10 @@ class TestRun(unittest.TestCase):
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
+                # 'extra_headers':{'key': 'value'},
             },
             # ignore_errors=True,
-            use_cache='outputs/20250519_142106'
+            # use_cache='outputs/20250519_142106'
         )
 
         run_task(task_cfg=task_cfg)
@@ -427,24 +429,27 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='qwen2.5-0.5b-instruct',
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-                # 'aime24',
+                'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
-                'chinese_simpleqa',
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                # 'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
             ],
             dataset_args={
                 'competition_math': {
@@ -469,13 +474,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
             eval_batch_size=10,
-            limit=10,
+            limit=1,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-7b-instruct',
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -491,7 +499,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-            use_cache='outputs/20250519_142551'
+            analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250602_135859'
         )
 
         run_task(task_cfg=task_cfg)
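
A note on the first hunk above: the log-level environment variable is now namespaced as EVALSCOPE_LOG_LEVEL, which suggests a bare LOG_LEVEL no longer takes effect in 0.16.1. A minimal sketch of the new usage, set before the logger is first created:

import os

# 0.16.0 honored LOG_LEVEL; 0.16.1 reads EVALSCOPE_LOG_LEVEL (per the diff above).
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

from evalscope.utils.logger import get_logger

logger = get_logger()
logger.debug('debug logging is now enabled')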
tests/rag/test_mteb.py CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
             ],
             'eval': {
                 'tasks': [
-                    # 'TNews',
-                    # 'CLSClusteringS2S',
+                    'TNews',
+                    'CLSClusteringS2S',
                     'T2Reranking',
-                    # 'T2Retrieval',
-                    # 'ATEC',
+                    'T2Retrieval',
+                    'ATEC',
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
             ],
             'verbosity': 2,
             'overwrite_results': True,
-            'limits': 30,
+            'limits': 10,
         },
     },
 )
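
The MTEB test now enables the full CMTEB task mix (TNews, CLSClusteringS2S, T2Reranking, T2Retrieval, ATEC) and trims the second config's limit from 30 to 10 samples. For context, these task lists sit inside a RAGEval backend config of roughly the shape below; the eval_backend/eval_config layout and the embedding model id are assumptions based on evalscope's documented CMTEB usage, not part of this diff:

task_cfg = {
    'eval_backend': 'RAGEval',  # assumed backend name, from evalscope docs
    'eval_config': {
        'tool': 'MTEB',
        'model': [{
            'model_name_or_path': 'AI-ModelScope/m3e-base',  # illustrative embedding model
        }],
        'eval': {
            'tasks': ['TNews', 'CLSClusteringS2S', 'T2Reranking', 'T2Retrieval', 'ATEC'],
            'verbosity': 2,
            'overwrite_results': True,
            'limits': 10,  # cap samples per task, as in the updated test
        },
    },
}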