evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +20 -25
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/data_adapter.py +97 -16
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/utils.py +25 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +18 -6
- evalscope/config.py +8 -2
- evalscope/evaluator/evaluator.py +38 -27
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/models/adapters/server_adapter.py +2 -6
- evalscope/perf/arguments.py +2 -2
- evalscope/perf/benchmark.py +0 -9
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +60 -3
- evalscope/run.py +12 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -35
- tests/cli/test_collection.py +7 -6
- tests/cli/test_run.py +21 -11
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
tests/cli/test_run.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
 
-os.environ['
+os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 
 logger = get_logger()
 
@@ -311,7 +311,8 @@ class TestRun(unittest.TestCase):
                 # 'maritime_bench',
                 # 'drop',
                 # 'winogrande',
-                'tool_bench',
+                # 'tool_bench',
+                'frames',
             ],
             dataset_args={
                 'mmlu': {
@@ -384,9 +385,10 @@ class TestRun(unittest.TestCase):
                 'temperature': 0,
                 'n': 1,
                 'max_tokens': 4096,
+                # 'extra_headers':{'key': 'value'},
             },
             # ignore_errors=True,
-            use_cache='outputs/20250519_142106'
+            # use_cache='outputs/20250519_142106'
         )
 
         run_task(task_cfg=task_cfg)
@@ -427,24 +429,27 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='
+            model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=[
                 # 'math_500',
-
+                'aime24',
                 # 'competition_math',
                 # 'arc',
-                # 'gsm8k'
+                # 'gsm8k',
                 # 'truthful_qa',
                 # 'simple_qa',
-                'chinese_simpleqa',
+                # 'chinese_simpleqa',
                 # 'live_code_bench',
                 # 'humaneval',
                 # 'general_qa',
                 # 'alpaca_eval',
-                # 'arena_hard'
+                # 'arena_hard',
+                # 'frames',
+                # 'docmath',
+                # 'needle_haystack',
             ],
             dataset_args={
                 'competition_math': {
@@ -469,13 +474,16 @@ class TestRun(unittest.TestCase):
                         '中华文化'
                     ]
                 },
+                'frames': {
+                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+                }
             },
             eval_batch_size=10,
-            limit=
+            limit=1,
             judge_strategy=JudgeStrategy.AUTO,
             judge_worker_num=5,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-72b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -491,7 +499,9 @@ class TestRun(unittest.TestCase):
             },
             timeout=60000,
             stream=True,
-
+            analysis_report=True,
+            # debug=True,
+            # use_cache='outputs/20250602_135859'
         )
 
         run_task(task_cfg=task_cfg)
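Taken together, the updated test exercises the new 'frames' benchmark end to end. The following is a minimal standalone sketch assembled from the hunks above; the evalscope.constants import path for EvalType is an assumption, and the local_path value is simply the location this test happens to use:

import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType  # assumed import path
from evalscope.run import run_task

# Service-mode smoke test of the new 'frames' benchmark, mirroring
# the values introduced in the hunks above.
task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.environ.get('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['frames'],
    dataset_args={
        'frames': {
            # Path the test uses; point this at your own dataset copy.
            'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
        }
    },
    limit=1,  # evaluate one sample per dataset for a quick check
)

run_task(task_cfg=task_cfg)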
tests/rag/test_mteb.py
CHANGED
@@ -46,11 +46,11 @@ class TestMTEB(unittest.TestCase):
                 ],
                 'eval': {
                     'tasks': [
-
-
+                        'TNews',
+                        'CLSClusteringS2S',
                         'T2Reranking',
-
-
+                        'T2Retrieval',
+                        'ATEC',
                     ],
                     'verbosity': 2,
                     'overwrite_results': True,
@@ -85,7 +85,7 @@ class TestMTEB(unittest.TestCase):
                 ],
                 'verbosity': 2,
                 'overwrite_results': True,
-                'limits':
+                'limits': 10,
             },
         },
     )
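For reference, the CMTEB task list this diff settles on can be run as sketched below. Only the 'eval' block is taken from the hunks above; the enclosing eval_backend / 'tool' / 'model' keys follow evalscope's RAGEval convention and are assumptions here, including the placeholder embedding model:

from evalscope.run import run_task

# Sketch of a CMTEB run with the updated task list. The 'eval' block
# mirrors the diff; everything around it is an assumed RAGEval-style
# wrapper, not part of this release.
task_cfg = {
    'eval_backend': 'RAGEval',
    'eval_config': {
        'tool': 'MTEB',
        'model': [{'model_name_or_path': 'AI-ModelScope/bge-large-zh'}],  # hypothetical model choice
        'eval': {
            'tasks': [
                'TNews',
                'CLSClusteringS2S',
                'T2Reranking',
                'T2Retrieval',
                'ATEC',
            ],
            'verbosity': 2,
            'overwrite_results': True,
            'limits': 10,  # cap samples per task for a quick pass
        },
    },
}

run_task(task_cfg=task_cfg)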