evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/report/__init__.py
CHANGED

@@ -4,9 +4,16 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        percentage_weighted_average_from_subsets,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
     from .generator import ReportGenerator
-    from .report import Category, Report, ReportKey, Subset
+    from .report import Category, Metric, Report, ReportKey, Subset
 
 else:
     _import_structure = {
@@ -14,7 +21,9 @@ else:
             'gen_table',
             'get_data_frame',
             'get_report_list',
-            '
+            'weighted_average_from_subsets',
+            'unweighted_average_from_subsets',
+            'percentage_weighted_average_from_subsets',
         ],
         'generator': [
             'ReportGenerator',
@@ -24,6 +33,7 @@ else:
             'Report',
             'ReportKey',
             'Subset',
+            'Metric',
         ],
     }
 
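The lazy-import table above must stay in sync with the TYPE_CHECKING block. A minimal sketch of how the newly exported names resolve at runtime (behaviour assumed from the _LazyModule pattern, not verified against its implementation):

# Names listed in _import_structure are resolved lazily on first attribute access,
# so importing evalscope.report stays cheap until one of these symbols is used.
from evalscope.report import Metric, weighted_average_from_subsets

print(Metric, weighted_average_from_subsets)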
evalscope/report/combinator.py
CHANGED

@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.report import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -88,26 +88,97 @@ def gen_table(
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-
-
-
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
 
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
+
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
+
+
+def percentage_weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], weights: List[float], new_name: str = ''
+) -> Subset:
+    """Calculate percentage weighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        weights (List[float]): The weight for each corresponding accuracy entry.
+            Can sum to any positive value – they will be normalised internally.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with percentage weighted average score.
+    """
+    assert len(subset_names) == len(weights), \
+        'The number of subset names must match the number of weights.'
+
+    valid_subsets = []
+    valid_weights = []
+    total_count = 0
+
+    for name, weight in zip(subset_names, weights):
+        if name in subset_dict:
+            subset = subset_dict[name]
+            valid_subsets.append(subset)
+            valid_weights.append(weight)
+            total_count += subset.num
+
+    if not valid_subsets:
+        return Subset(name=new_name, score=0, num=0)
 
+    weight_sum = sum(valid_weights)
+    assert weight_sum > 0, \
+        f"Sum of weights for percentage_weighted_average_from_subsets for '{new_name}' is not positive."
 
-
-
-    # report_dir_2 = './outputs/20250107_204445/reports'
+    # Normalise weights so that they sum to 1.0
+    weights_norm = [w / weight_sum for w in valid_weights]
 
-
-
+    total_score = 0
+    for subset, weight in zip(valid_subsets, weights_norm):
+        total_score += subset.score * weight
 
-
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
+    return Subset(name=new_name, score=total_score, num=total_count)
evalscope/report/generator.py
CHANGED

@@ -8,105 +8,26 @@ from evalscope.report.report import *
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
     from evalscope.api.metric import AggScore
-    from evalscope.benchmarks import DataAdapter as OldDataAdapter
 
 
 class ReportGenerator:
 
     @staticmethod
-    def
-        """
-        Generate a report for a specific dataset based on provided subset scores.
-
-        Args:
-            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
-                {
-                    'subset_name': [
-                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
-                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
-                    ],
-                    ...
-                }
-            report_name (str): The name of the report to generate.
-            data_adapter (DataAdapter): An adapter object for data handling.
-
-        Returns:
-            Report: A structured report object containing metrics, categories, and subsets.
-
-        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
-        """  # noqa: E501
-
-        dataset_name = data_adapter.name
-        category_map = data_adapter.category_map
-        report_name = f'{model_name}@{dataset_name}'
-
-        def flatten_subset() -> DataFrame:
-            """
-            Flatten subset score map to a DataFrame.
-
-            Example:
-                name score num categories metric_name
-            0 ARC-Easy 0.5 2 [default] AverageAccuracy
-            1 ARC-Challenge 0.5 2 [default] AverageAccuracy
-            """
-            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
-                    categories = category_map.get(subset_name, ['default'])
-                    if isinstance(categories, str):
-                        categories = [categories]
-                    subsets.append(
-                        dict(
-                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)
-                        )
-                    )
-            df = pd.DataFrame(subsets)
-            return df
-
-        df = flatten_subset()
-
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
         metrics_list = []
-        for metric_name, group_metric in df.groupby('
+        for metric_name, group_metric in df.groupby('metric', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
-                for
-
-
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                         'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
                 categories.append(Category(name=category_name, subsets=subsets))
-
             metrics_list.append(Metric(name=metric_name, categories=categories))
-
-        report = Report(
-            name=report_name,
-            metrics=metrics_list,
-            dataset_name=dataset_name,
-            model_name=model_name,
-            dataset_description=data_adapter.description,
-            dataset_pretty_name=data_adapter.pretty_name
-        )
-        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
         return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
         )
 
     @staticmethod
evalscope/report/report.py
CHANGED

@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
         score = round(score, keep_num)
     elif isinstance(score, dict):
         score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
     else:
         logger.warning(f'Unknown score type: {type(score)}')
-
     return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
     subset_name = 'Subset'
     num = 'Num'
     score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
             # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                 table[ReportKey.model_name].append(self.model_name)
                 table[ReportKey.dataset_name].append(self.dataset_name)
                 table[ReportKey.metric_name].append(metric.name)
                 table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                 table[ReportKey.num].append(metric.num)
                 table[ReportKey.score].append(metric.score)
             # NOTE: only flatten metrics if needed, use the first metric by default
evalscope/run.py
CHANGED

@@ -38,6 +38,7 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     if task_cfg.eval_backend != EvalBackend.NATIVE:
         result = run_non_native_backend(task_cfg, outputs)
     else:
+        logger.info('Running with native backend')
         result = evaluate_model(task_cfg, outputs)
 
     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
@@ -94,12 +95,15 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
 def get_backend_manager_class(eval_backend: EvalBackend):
     """Get the backend manager class based on the evaluation backend."""
     if eval_backend == EvalBackend.OPEN_COMPASS:
+        logger.info('Using OpenCompassBackendManager')
         from evalscope.backend.opencompass import OpenCompassBackendManager
         return OpenCompassBackendManager
     elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+        logger.info('Using VLMEvalKitBackendManager')
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         return VLMEvalKitBackendManager
     elif eval_backend == EvalBackend.RAG_EVAL:
+        logger.info('Using RAGEvalBackendManager')
         from evalscope.backend.rag_eval import RAGEvalBackendManager
         return RAGEvalBackendManager
     elif eval_backend == EvalBackend.THIRD_PARTY:
@@ -131,8 +135,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         )
         evaluators.append(evaluator)
 
-        # Update task_config.dataset_args with benchmark metadata
-
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
 
     # dump task_cfg to outputs.configs_dir after creating evaluators
     task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +154,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
     if model is not None:
         import gc
-        import torch
 
         del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
     return eval_results
 
 
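The cleanup block now only touches torch when it is importable. A sketch of the same guard as a standalone helper (free_accelerator_memory is a hypothetical name; check_import is used exactly as in the diff, assuming it returns a falsy value when the package is missing):

import gc

from evalscope.utils.import_utils import check_import

def free_accelerator_memory() -> None:
    """Hypothetical helper mirroring the guarded cleanup in evaluate_model."""
    gc.collect()
    # Skip the torch-specific step entirely on installs without torch (e.g. API-only evaluation).
    if check_import('torch', raise_warning=False):
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()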
evalscope/utils/chat_service.py
CHANGED

@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
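Moving `import torch` into `ChatService.__init__` defers the heavy dependency until a local model is actually served. A minimal sketch of the same pattern (hypothetical class, not the real ChatService):

class LocalChatStub:
    """Hypothetical example of the deferred-import pattern used above."""

    def __init__(self, model_path: str):
        import torch  # only imported when an instance is created
        from modelscope import AutoModelForCausalLM, AutoTokenizer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path).to(self.device)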