evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -11,7 +11,7 @@ logger = get_logger()
 @dataclass
 class BenchmarkData:
     request: Any = None
-    start_time: float =
+    start_time: float = 0.0
     completed_time: float = 0.0
     chunk_times: List[float] = field(default_factory=list)
     success: bool = False
@@ -32,13 +32,13 @@ class BenchmarkData:
         self.query_latency = self.completed_time - self.start_time
         if len(self.chunk_times) > 1:
             self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            self.n_chunks = len(self.chunk_times) - 2
+            self.n_chunks = len(self.chunk_times) - 2  # remove last and first chunk
             self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
         else:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.
+        self.time_per_output_token = self.n_chunks_time / self.completion_tokens
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
@@ -73,7 +73,9 @@ class BenchmarkMetrics:
     avg_chunk_time: float = -1
     avg_prompt_tokens: float = -1
     avg_completion_tokens: float = -1
-
+    avg_input_token_per_seconds: float = -1
+    avg_output_token_per_seconds: float = -1
+    avg_total_token_per_seconds: float = -1
     avg_time_per_token: float = -1
     qps: float = -1
 
@@ -111,22 +113,26 @@ class BenchmarkMetrics:
             self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
             self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
             self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
-            self.
+            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+            self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+            self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+                                                + self.n_total_completion_tokens) / self.total_time
             self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
             self.qps = self.n_succeed_queries / self.total_time
         except ZeroDivisionError as e:
             logger.exception(e)
             return
 
-    def create_message(self, default_ndigits=
+    def create_message(self, default_ndigits=4):
         message = {
             'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
-            '
-            '
+            'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+            'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+            'Request throughput (req/s)': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
evalscope/perf/utils/db_util.py
CHANGED
@@ -165,6 +165,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
     FIRST_CHUNK_LATENCY_INDEX = 5
+    CHUNK_TIME_INDEX = 7
     PROMPT_TOKENS_INDEX = 8
     COMPLETION_TOKENS_INDEX = 9
 
@@ -175,14 +176,19 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
 
     metrics = {
         'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
-        '
+        'ITL (s)':
         inter_token_latencies_all,
+        'TPOT (s)':
+        [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
+         for row in rows],
         'Latency (s)': [row[LATENCY_INDEX] for row in rows],
         'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
         'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
-        '
+        'Output throughput(tok/s)':
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-         for row in rows]
+         for row in rows],
+        'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                     / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
     }
 
     # Calculate percentiles for each metric
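For reference, TPOT here divides each request's chunk time by its completion tokens, while the throughput columns divide tokens by end-to-end latency. A small self-contained sketch of how such per-row values feed the percentile table (the rows are hypothetical; the real code reads them from the results DB):

```python
import numpy as np

# Hypothetical rows shaped as (chunk_time, latency, prompt_tokens, completion_tokens),
# i.e. the columns behind CHUNK_TIME_INDEX, LATENCY_INDEX, PROMPT_TOKENS_INDEX and
# COMPLETION_TOKENS_INDEX in the diff above.
rows = [(1.8, 2.0, 500, 120), (3.5, 4.0, 800, 300), (0.9, 1.0, 300, 0)]

tpot = [(chunk / out) if out > 0 else float('nan') for chunk, _, _, out in rows]
total_tput = [((inp + out) / lat) if lat > 0 else float('nan') for _, lat, inp, out in rows]

# Percentiles per metric, skipping the NaNs produced by zero-token rows:
print(np.nanpercentile(tpot, [50, 90, 99]))
print(np.nanpercentile(total_tput, [50, 90, 99]))
```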
evalscope/perf/utils/log_utils.py
ADDED
@@ -0,0 +1,41 @@
+import os
+
+from evalscope.perf.arguments import Arguments
+
+
+def init_wandb(args: Arguments) -> None:
+    """
+    Initialize WandB for logging.
+    """
+    # Initialize wandb if the api key is provided
+    import datetime
+    try:
+        import wandb
+    except ImportError:
+        raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+    os.environ['WANDB_SILENT'] = 'true'
+    os.environ['WANDB_DIR'] = args.outputs_dir
+
+    wandb.login(key=args.wandb_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+def init_swanlab(args: Arguments) -> None:
+    import datetime
+    try:
+        import swanlab
+    except ImportError:
+        raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+    os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+    if not args.swanlab_api_key == 'local':
+        swanlab.login(api_key=args.swanlab_api_key)
+    current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+    name = args.name if args.name else f'{args.model_id}_{current_time}'
+    swanlab.config.update({'framework': '📏evalscope'})
+    swanlab.init(
+        project='perf_benchmark',
+        name=name,
+        config=args.to_dict(),
+        mode='local' if args.swanlab_api_key == 'local' else None)
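A sketch of how these helpers would be invoked; the call site shown is hypothetical (the real wiring lives in the perf benchmark setup), but both functions are opt-in on their API keys:

```python
# Hypothetical call site, assuming Arguments carries wandb_api_key / swanlab_api_key.
from evalscope.perf.arguments import Arguments
from evalscope.perf.utils.log_utils import init_swanlab, init_wandb

def maybe_init_trackers(args: Arguments) -> None:
    # Each tracker is initialized only when its API key is supplied.
    if args.wandb_api_key:
        init_wandb(args)
    if args.swanlab_api_key:
        init_swanlab(args)
```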
evalscope/report/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+from evalscope.report.app_arguments import add_argument
 from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
 from evalscope.report.generator import ReportGenerator
 from evalscope.report.utils import Category, Report, ReportKey, Subset
evalscope/report/app.py
CHANGED
@@ -11,7 +11,7 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
@@ -22,6 +22,23 @@ PLOTLY_THEME = 'plotly_dark'
 REPORT_TOKEN = '@@'
 MODEL_TOKEN = '::'
 DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
 
 
 def scan_for_report_folders(root_path):
@@ -44,7 +61,7 @@ def scan_for_report_folders(root_path):
             continue
         datasets = []
         for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-            datasets.append(os.path.basename(dataset_item)
+            datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
         datasets = DATASET_TOKEN.join(datasets)
         reports.append(
             f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -234,6 +251,18 @@ def convert_html_tags(text):
     return text
 
 
+def convert_markdown_image(text):
+    if not os.path.isfile(text):
+        return text
+    # Convert the image path to a markdown image tag
+    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+        text = os.path.abspath(text)
+        image_tag = f'![image]({text})'
+        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+        return image_tag
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
     string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
@@ -253,17 +282,17 @@ def process_model_prediction(item: Any):
 
 
 def normalize_score(score):
-
-
-
-
-
-
-    else:
-        try:
-            return float(score)
-        except (ValueError, TypeError):
+    try:
+        if isinstance(score, bool):
+            return 1.0 if score else 0.0
+        elif isinstance(score, dict):
+            for key in score:
+                return float(score[key])
             return 0.0
+        else:
+            return float(score)
+    except (ValueError, TypeError):
+        return 0.0
 
 
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
@@ -285,7 +314,7 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
             'Input': raw_input,
             'Generated': raw_pred_answer,
             'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-            'Pred': parsed_pred_answer
+            'Pred': parsed_pred_answer,
             'Score': score,
             'NScore': normalize_score(score)
         }
@@ -295,22 +324,6 @@
     return df_subset
 
 
-def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
-    if data_review_df is None:
-        return pd.DataFrame(), None
-
-    logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
-    start = (page - 1) * rows_per_page
-    end = start + rows_per_page
-    df_subset = data_review_df.iloc[start:end].copy()
-    df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
-    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
-    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
-    df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
-    styler = style_df(df_subset, columns=['NScore'])
-    return df_subset, styler
-
-
 @dataclass
 class SidebarComponents:
     root_path: gr.Textbox
@@ -457,7 +470,11 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         'page': {
             'zh': '页码',
             'en': 'Page'
-        }
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
     }
 
     # Update the UI components with localized labels
@@ -489,37 +506,53 @@
         gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
         subset_select = gr.Dropdown(
             label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
+
         with gr.Row():
             answer_mode_radio = gr.Radio(
                 label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
-
-
-            answer_mode_counts = gr.Markdown('', label='Counts')
+            score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'][lang], interactive=True)
+
         data_review_df = gr.State(None)
         filtered_review_df = gr.State(None)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            '
-
-
-
-
+
+        # show statistics
+        with gr.Row(variant='panel'):
+            with gr.Column():
+                gr.Markdown('### *Counts*')
+                answer_mode_counts = gr.Markdown('')
+            with gr.Column():
+                page_number = gr.Number(
+                    value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
+
+        # show data review table
+        with gr.Row(variant='panel'):
+            with gr.Column():
+                gr.Markdown('### *Score*')
+                score_text = gr.Markdown(
+                    '', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+            with gr.Column():
+                gr.Markdown('### *Normalized Score*')
+                nscore = gr.Markdown('', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS)
+
+        with gr.Row(variant='panel'):
+            with gr.Column():
+                gr.Markdown('### *Gold*')
+                gold_text = gr.Markdown(
+                    '', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+            with gr.Column():
+                gr.Markdown('### *Pred*')
+                pred_text = gr.Markdown(
+                    '', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+
+        with gr.Row(variant='panel'):
+            with gr.Column():
+                gr.Markdown('### *Input*')
+                input_text = gr.Markdown(
+                    '', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+            with gr.Column():
+                gr.Markdown('### *Generated*')
+                generated_text = gr.Markdown(
+                    '', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
 
     @report_name.change(
         inputs=[sidebar.root_path, report_name],
@@ -561,15 +594,15 @@
             return data_review_df, 1
 
     @gr.on(
-        triggers=[data_review_df.change, answer_mode_radio.change],
-        inputs=[data_review_df, answer_mode_radio],
+        triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
+        inputs=[data_review_df, answer_mode_radio, score_threshold],
         outputs=[filtered_review_df, page_number, answer_mode_counts])
-    def filter_data(data_review_df, answer_mode):
+    def filter_data(data_review_df, answer_mode, score_threshold):
        if data_review_df is None:
             return None, gr.update(value=1, maximum=1), ''
 
         all_count = len(data_review_df)
-        pass_df = data_review_df[data_review_df['NScore'] >=
+        pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
         pass_count = len(pass_df)
         fail_count = all_count - pass_count
 
@@ -578,7 +611,7 @@
         if answer_mode == 'Pass':
             filtered_df = pass_df
         elif answer_mode == 'Fail':
-            filtered_df = data_review_df[data_review_df['NScore'] <
+            filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
         else:
             filtered_df = data_review_df
 
@@ -588,13 +621,33 @@
 
     @gr.on(
         triggers=[filtered_review_df.change, page_number.change],
-        inputs=[filtered_review_df, page_number],
-        outputs=[
-    def
-        if filtered_df is None:
-            return
-
-
+        inputs=[filtered_review_df, page_number, score_threshold],
+        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+    def update_table_components(filtered_df, page_number, score_threshold):
+        if filtered_df is None or len(filtered_df) == 0:
+            return '', '', '', '', '', ''
+
+        # Get single row data for the current page
+        start = (page_number - 1)
+        if start >= len(filtered_df):
+            return '', '', '', '', '', ''
+
+        row = filtered_df.iloc[start]
+
+        # Process the data for display
+        input_md = process_model_prediction(row['Input'])
+        generated_md = process_model_prediction(row['Generated'])
+        gold_md = process_model_prediction(row['Gold'])
+        pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+        score_md = process_model_prediction(row['Score'])
+        nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
+
+        if nscore_val >= score_threshold:
+            nscore_val = f'<div style="background-color:rgb(45,104, 62); padding:10px;">{nscore_val}</div>'
+        else:
+            nscore_val = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
+
+        return input_md, generated_md, gold_md, pred_md, score_md, nscore_val
 
     return SingleModelComponents(report_name=report_name)
 
@@ -696,16 +749,13 @@ def create_app(args: argparse.Namespace):
         text = '<' if new_visible else '>'
         return gr.update(visible=new_visible), new_visible, gr.update(value=text)
 
-    demo.launch(
-
-
-
-
-
-
-    parser.add_argument('--debug', action='store_true', help='Debug the app.')
-    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
-    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+    demo.launch(
+        share=args.share,
+        server_name=args.server_name,
+        server_port=args.server_port,
+        debug=args.debug,
+        allowed_paths=args.allowed_paths,
+    )
 
 
 if __name__ == '__main__':
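The reworked `normalize_score` above now funnels every branch through a single try/except. Copied verbatim from the diff and exercised on a few representative scores:

```python
# normalize_score as rewritten in app.py above:
def normalize_score(score):
    try:
        if isinstance(score, bool):
            return 1.0 if score else 0.0
        elif isinstance(score, dict):
            for key in score:
                return float(score[key])
            return 0.0
        else:
            return float(score)
    except (ValueError, TypeError):
        return 0.0

print(normalize_score(True))               # 1.0  (bool handled before float())
print(normalize_score({'rouge-1': 0.42}))  # 0.42 (first value in the dict)
print(normalize_score('0.75'))             # 0.75
print(normalize_score('N/A'))              # 0.0  (ValueError swallowed)
```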
evalscope/report/app_arguments.py
ADDED
@@ -0,0 +1,11 @@
+import argparse
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    parser.add_argument('--share', action='store_true', help='Share the app.')
+    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+    parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+    parser.add_argument('--debug', action='store_true', help='Debug the app.')
+    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+    parser.add_argument('--allowed-paths', nargs='+', default=['/'], help='The outputs dir.')
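A short sketch of how these shared flags are consumed; `add_argument` is re-exported from `evalscope.report` per the `__init__.py` change above, and the argv shown is illustrative:

```python
import argparse
from evalscope.report import add_argument

parser = argparse.ArgumentParser(description='EvalScope report app')
add_argument(parser)  # --share, --server-name, --server-port, --debug, --lang, --outputs, --allowed-paths
args = parser.parse_args(['--lang', 'en', '--server-port', '7860'])
print(args.server_name, args.server_port, args.allowed_paths)  # 0.0.0.0 7860 ['/']
```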
evalscope/report/generator.py
CHANGED
@@ -48,7 +48,7 @@ class ReportGenerator:
         df = flatten_subset()
 
         metrics_list = []
-        for metric_name, group_metric in df.groupby('metric_name'):
+        for metric_name, group_metric in df.groupby('metric_name', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
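The `sort=False` matters for report ordering: pandas otherwise sorts group keys alphabetically, while `sort=False` preserves first-appearance order. A quick standalone illustration:

```python
import pandas as pd

df = pd.DataFrame({'metric_name': ['Pass@1', 'AverageAccuracy', 'Pass@1'],
                   'score': [0.5, 0.7, 0.6]})

print([name for name, _ in df.groupby('metric_name')])
# ['AverageAccuracy', 'Pass@1']  <- default: keys sorted alphabetically
print([name for name, _ in df.groupby('metric_name', sort=False)])
# ['Pass@1', 'AverageAccuracy']  <- sort=False: first-appearance order
```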
evalscope/run.py
CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs
 
 
@@ -146,10 +153,10 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
         data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
         return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
 
-    # Initialize
-    model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
-    # Initialize data adapter
+    # Initialize data adapter first to update config
     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    # Initialize model adapter
+    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
 
     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
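An illustrative sketch of what the new RAG_EVAL branch does to a config; it assumes the `Tools` constants compare equal to the lower-cased tool names ('mteb' / 'clip_benchmark'), which is not verified here, and the paths are placeholders:

```python
# Illustrative only: redirecting tool output locations to the unified work_dir.
work_dir = 'outputs/20250101_000000'
eval_config = {'tool': 'MTEB', 'eval': {'output_folder': './embeddings_out'}}

if eval_config['tool'].lower() == 'mteb':            # stands in for Tools.MTEB
    eval_config['eval']['output_folder'] = work_dir
elif eval_config['tool'].lower() == 'clip_benchmark':  # stands in for Tools.CLIP_BENCHMARK
    eval_config['eval']['output_dir'] = work_dir

print(eval_config['eval']['output_folder'])  # outputs/20250101_000000
```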
evalscope/summarizer.py
CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                 summary_res: dict = csv_to_list(summary_file_path)[0]
             elif summary_file_path.endswith('json'):
                 summary_res: dict = json_to_dict(summary_file_path)
-
+            base_name = os.path.basename(summary_file_path)
+            file_name = os.path.splitext(base_name)[0]
             final_res_list.append({file_name: summary_res})
 
         elif eval_backend == EvalBackend.THIRD_PARTY:
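The basename/splitext pair strips both the directory and the extension from the summary file name, e.g.:

```python
import os

summary_file_path = 'outputs/20250101_000000/reports/summary.json'
base_name = os.path.basename(summary_file_path)  # 'summary.json'
file_name = os.path.splitext(base_name)[0]       # 'summary'
print(file_name)
```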
evalscope/third_party/thinkbench/eval.py
CHANGED
@@ -357,7 +357,7 @@ judge_config = dict(
 )
 
 distill_qwen_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250218_180219',
     model_name = 'DeepSeek-R1-Distill-Qwen-7B',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
     dataset_name = 'math_500',
@@ -367,7 +367,7 @@ distill_qwen_config = dict(
 )
 
 math_qwen_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250219_202358',
     model_name = 'Qwen2.5-Math-7B-Instruct',
     tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
     dataset_name = 'math_500',
@@ -377,7 +377,7 @@ math_qwen_config = dict(
 )
 
 r1_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250307_000404',
     model_name = 'deepseek-r1',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1',
     dataset_name = 'math_500',
@@ -387,7 +387,7 @@ r1_config = dict(
 )
 
 qwq_preview_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250221_105911',
     model_name = 'qwq-32b-preview',
     tokenizer_path = 'Qwen/QwQ-32B-Preview',
     dataset_name = 'math_500',
@@ -397,7 +397,7 @@ qwq_preview_config = dict(
 )
 
 qwq_config = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250306_181550',
     model_name = 'QwQ-32B',
     tokenizer_path = 'Qwen/QwQ-32B',
     dataset_name = 'math_500',
@@ -407,7 +407,7 @@ qwq_config = dict(
 )
 
 distill_qwen_32b = dict(
-    report_path = '
+    report_path = '../eval-scope/outputs/20250306_235951',
     model_name = 'deepseek-r1-distill-qwen-32b',
     tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
     dataset_name = 'math_500',
@@ -416,14 +416,26 @@ distill_qwen_32b = dict(
     judge_config=judge_config
 )
 
+qwen3_32b_think = dict(
+    report_path = '../eval-scope/outputs/20250428_151817',
+    model_name = 'Qwen3-32B',
+    tokenizer_path = 'Qwen/Qwen3-32B',
+    dataset_name = 'math_500',
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    split_strategies='separator',
+    judge_config=judge_config
+)
+
 if __name__ == '__main__':
     # run_task(distill_qwen_config, count=80)
     # run_task(math_qwen_config)
     # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
     # run_task(r1_config, max_tokens=20000, count=200, workers=128)
     # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+    run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
     # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
 
     # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
     # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
-    combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+    # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+    combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')
evalscope/utils/chat_service.py
CHANGED
@@ -64,10 +64,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
 class ChatCompletionResponse(BaseModel):
     model: str
-    object: Literal['chat.completion', 'chat.completion.chunk']
+    object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 
 class TextCompletionRequest(BaseModel):