evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/report/app_arguments.py
ADDED
@@ -0,0 +1,11 @@
+import argparse
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    parser.add_argument('--share', action='store_true', help='Share the app.')
+    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+    parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+    parser.add_argument('--debug', action='store_true', help='Debug the app.')
+    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+    parser.add_argument('--allowed-paths', nargs='+', default=['/'], help='The outputs dir.')
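The new module only registers the visualization app's CLI flags on an existing parser. A minimal sketch of how it could be wired up (the import path comes from this diff; the standalone parser below is an assumption for illustration):

import argparse

from evalscope.report.app_arguments import add_argument  # path taken from this diff

parser = argparse.ArgumentParser('evalscope-app')  # hypothetical program name
add_argument(parser)
args = parser.parse_args(['--lang', 'en', '--server-port', '7860'])
print(args.lang, args.server_port, args.outputs)  # -> en 7860 ./outputs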
evalscope/report/generator.py
CHANGED
@@ -48,7 +48,7 @@ class ReportGenerator:
         df = flatten_subset()
 
         metrics_list = []
-        for metric_name, group_metric in df.groupby('metric_name'):
+        for metric_name, group_metric in df.groupby('metric_name', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
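The `sort=False` change keeps metrics in their first-appearance order instead of pandas' default alphabetical group ordering. A small self-contained illustration of the difference (toy data, not evalscope's report frame):

import pandas as pd

df = pd.DataFrame({'metric_name': ['Pass@1', 'AverageAccuracy', 'Pass@1'], 'score': [0.5, 0.7, 0.6]})

print([name for name, _ in df.groupby('metric_name')])              # ['AverageAccuracy', 'Pass@1'] (sorted)
print([name for name, _ in df.groupby('metric_name', sort=False)])  # ['Pass@1', 'AverageAccuracy'] (original order)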
evalscope/run.py
CHANGED
@@ -153,10 +153,10 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
         data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
         return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
 
-    # Initialize
-    model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
-    # Initialize data adapter
+    # Initialize data adapter first to update config
     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
+    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
 
     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
evalscope/third_party/thinkbench/eval.py
CHANGED
@@ -357,7 +357,7 @@ judge_config = dict(
 )
 
 distill_qwen_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250218_180219',
     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
     dataset_name = 'math_500',
@@ -367,7 +367,7 @@ distill_qwen_config = dict(
 )
 
 math_qwen_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250219_202358',
     model_name = 'Qwen2.5-Math-7B-Instruct',
     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
     dataset_name = 'math_500',
@@ -377,7 +377,7 @@ math_qwen_config = dict(
 )
 
 r1_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250307_000404',
     model_name = 'deepseek-r1',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
     dataset_name = 'math_500',
@@ -387,7 +387,7 @@ r1_config = dict(
 )
 
 qwq_preview_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250221_105911',
     model_name = 'qwq-32b-preview',
     tokenizer_path = 'Qwen/QwQ-32B-Preview',
     dataset_name = 'math_500',
@@ -397,7 +397,7 @@ qwq_preview_config = dict(
 )
 
 qwq_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250306_181550',
     model_name = 'QwQ-32B',
     tokenizer_path = 'Qwen/QwQ-32B',
     dataset_name = 'math_500',
@@ -407,7 +407,7 @@ qwq_config = dict(
 )
 
 distill_qwen_32b = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250306_235951',
     model_name = 'deepseek-r1-distill-qwen-32b',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
     dataset_name = 'math_500',
@@ -416,14 +416,26 @@ distill_qwen_32b = dict(
     judge_config=judge_config
 )
 
+qwen3_32b_think = dict(
+    report_path = '../eval-scope/outputs/20250428_151817',
+    model_name = 'Qwen3-32B',
+    tokenizer_path = 'Qwen/Qwen3-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
 if __name__ == '__main__':
     # run_task(distill_qwen_config, count=80)
     # run_task(math_qwen_config)
     # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
     # run_task(r1_config, max_tokens=20000, count=200, workers=128)
     # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+    run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
     # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
 
     # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
     # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
-    combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+    # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+    combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')
evalscope/utils/chat_service.py
CHANGED
@@ -64,10 +64,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
 class ChatCompletionResponse(BaseModel):
     model: str
-    object: Literal['chat.completion', 'chat.completion.chunk']
+    object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 
 class TextCompletionRequest(BaseModel):
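The `= None` default matters because in Pydantic v2 an `Optional[...]` annotation without a default is still a required field, so responses without token usage (such as image generations) previously could not be constructed. A standalone sketch of the behavior with simplified stand-in models, not the full evalscope classes:

from typing import Optional

from pydantic import BaseModel


class Usage(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0


class Strict(BaseModel):
    usage: Optional[Usage]          # required under Pydantic v2: Strict() raises a ValidationError


class Relaxed(BaseModel):
    usage: Optional[Usage] = None   # optional: Relaxed() is fine and usage defaults to None


print(Relaxed().usage)  # None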
evalscope/utils/import_utils.py
ADDED
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2023-present the HuggingFace Inc. team.
+
+import importlib
+import os
+from itertools import chain
+from types import ModuleType
+from typing import Any
+
+from .logger import get_logger
+
+logger = get_logger()  # pylint: disable=invalid-name
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f'module {self.__name__} has no attribute {name}')
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        return importlib.import_module('.' + module_name, self.__name__)
+
+    def __reduce__(self):
+        return self.__class__, (self._name, self.__file__, self._import_structure)
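`_LazyModule` is the HuggingFace-style lazy import pattern: a package's `__init__.py` replaces itself with this module object, and a submodule is only imported when one of its names is first accessed. A sketch of how a package might adopt it; the `_import_structure` entries below are made up for illustration, the real mappings live in evalscope's own `__init__` files:

import sys

from evalscope.utils.import_utils import _LazyModule  # module added in this release

# hypothetical mapping: {submodule_name: [public names it provides]}
_import_structure = {
    'clipscore': ['CLIPScore'],
    'vqascore': ['VQAScore'],
}

sys.modules[__name__] = _LazyModule(__name__, __file__, _import_structure, module_spec=__spec__)
# Accessing package.CLIPScore later triggers importlib.import_module('.clipscore', __name__).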
evalscope/utils/utils.py
CHANGED
@@ -76,16 +76,16 @@ def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
 class ResponseParser:
 
     @staticmethod
-    def parse_first_capital(text: str) -> str:
+    def parse_first_capital(text: str, options: list[str]) -> str:
         for t in text:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
     @staticmethod
-    def parse_last_capital(text: str) -> str:
+    def parse_last_capital(text: str, options: list[str]) -> str:
         for t in text[::-1]:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
@@ -155,6 +155,10 @@ class ResponseParser:
         for i in options:
             if i in outputs:
                 return i
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
         return 'No valid option found'
 
     @staticmethod
@@ -183,6 +187,10 @@
         matches = regex.search(text)
         if matches:
             return matches.group(1)
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
         return 'No valid option found'
 
 
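The new `options` parameter restricts capital-letter parsing to the valid choice labels, so stray capitals in the response no longer win, and the option parsers fall back to the last matching capital when nothing else is found. A quick sketch of the changed behavior, assuming `ResponseParser` is importable from `evalscope.utils.utils` as in this diff:

from evalscope.utils.utils import ResponseParser

text = 'The answer is B.'
print(ResponseParser.parse_first_capital(text, options=['A', 'B', 'C', 'D']))  # 'B' (skips the leading 'T')
print(ResponseParser.parse_last_capital(text, options=['A', 'B', 'C', 'D']))   # 'B'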
evalscope/version.py
CHANGED

{evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -28,8 +28,9 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
+Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -39,9 +40,16 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
+Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
+Provides-Extra: aigc
+Requires-Dist: diffusers; extra == "aigc"
+Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
+Requires-Dist: open-clip-torch; extra == "aigc"
+Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
@@ -55,8 +63,9 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
+Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -66,6 +75,7 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
@@ -86,6 +96,11 @@ Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
+Requires-Dist: diffusers; extra == "all"
+Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
+Requires-Dist: open-clip-torch; extra == "all"
+Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -199,6 +214,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
+- 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)