evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/report/__init__.py
CHANGED

@@ -4,9 +4,16 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        percentage_weighted_average_from_subsets,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
     from .generator import ReportGenerator
-    from .report import Category, Report, ReportKey, Subset
+    from .report import Category, Metric, Report, ReportKey, Subset
 
 else:
     _import_structure = {
@@ -14,7 +21,9 @@ else:
             'gen_table',
             'get_data_frame',
             'get_report_list',
-            '
+            'weighted_average_from_subsets',
+            'unweighted_average_from_subsets',
+            'percentage_weighted_average_from_subsets',
         ],
         'generator': [
             'ReportGenerator',
@@ -24,6 +33,7 @@ else:
             'Report',
             'ReportKey',
             'Subset',
+            'Metric',
         ],
     }
 
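The lazy-import table above must stay in sync with the TYPE_CHECKING block. A minimal sketch of how the newly exported names resolve at runtime (behaviour assumed from the _LazyModule pattern, not verified against its implementation):

# Names listed in _import_structure are resolved lazily on first attribute access,
# so importing evalscope.report stays cheap until one of these symbols is used.
from evalscope.report import Metric, weighted_average_from_subsets

print(Metric, weighted_average_from_subsets)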
evalscope/report/combinator.py
CHANGED

@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.report import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -88,26 +88,97 @@ def gen_table(
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-
-
-
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
 
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
+
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
+
+
+def percentage_weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], weights: List[float], new_name: str = ''
+) -> Subset:
+    """Calculate percentage weighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        weights (List[float]): The weight for each corresponding accuracy entry.
+            Can sum to any positive value – they will be normalised internally.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with percentage weighted average score.
+    """
+    assert len(subset_names) == len(weights), \
+        'The number of subset names must match the number of weights.'
+
+    valid_subsets = []
+    valid_weights = []
+    total_count = 0
+
+    for name, weight in zip(subset_names, weights):
+        if name in subset_dict:
+            subset = subset_dict[name]
+            valid_subsets.append(subset)
+            valid_weights.append(weight)
+            total_count += subset.num
+
+    if not valid_subsets:
+        return Subset(name=new_name, score=0, num=0)
 
+    weight_sum = sum(valid_weights)
+    assert weight_sum > 0, \
+        f"Sum of weights for percentage_weighted_average_from_subsets for '{new_name}' is not positive."
 
-
-
-    # report_dir_2 = './outputs/20250107_204445/reports'
+    # Normalise weights so that they sum to 1.0
+    weights_norm = [w / weight_sum for w in valid_weights]
 
-
-
+    total_score = 0
+    for subset, weight in zip(valid_subsets, weights_norm):
+        total_score += subset.score * weight
 
-
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
+    return Subset(name=new_name, score=total_score, num=total_count)
evalscope/report/generator.py
CHANGED

@@ -8,105 +8,26 @@ from evalscope.report.report import *
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
     from evalscope.api.metric import AggScore
-    from evalscope.benchmarks import DataAdapter as OldDataAdapter
 
 
 class ReportGenerator:
 
     @staticmethod
-    def
-        """
-        Generate a report for a specific dataset based on provided subset scores.
-
-        Args:
-            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
-                {
-                    'subset_name': [
-                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
-                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
-                    ],
-                    ...
-                }
-            report_name (str): The name of the report to generate.
-            data_adapter (DataAdapter): An adapter object for data handling.
-
-        Returns:
-            Report: A structured report object containing metrics, categories, and subsets.
-
-        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
-        """  # noqa: E501
-
-        dataset_name = data_adapter.name
-        category_map = data_adapter.category_map
-        report_name = f'{model_name}@{dataset_name}'
-
-        def flatten_subset() -> DataFrame:
-            """
-            Flatten subset score map to a DataFrame.
-
-            Example:
-                name score num categories metric_name
-            0 ARC-Easy 0.5 2 [default] AverageAccuracy
-            1 ARC-Challenge 0.5 2 [default] AverageAccuracy
-            """
-            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
-                    categories = category_map.get(subset_name, ['default'])
-                    if isinstance(categories, str):
-                        categories = [categories]
-                    subsets.append(
-                        dict(
-                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)
-                        )
-                    )
-            df = pd.DataFrame(subsets)
-            return df
-
-        df = flatten_subset()
-
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
         metrics_list = []
-        for metric_name, group_metric in df.groupby('
+        for metric_name, group_metric in df.groupby('metric', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
-                for
-
-
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                         'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
                 categories.append(Category(name=category_name, subsets=subsets))
-
             metrics_list.append(Metric(name=metric_name, categories=categories))
-
-        report = Report(
-            name=report_name,
-            metrics=metrics_list,
-            dataset_name=dataset_name,
-            model_name=model_name,
-            dataset_description=data_adapter.description,
-            dataset_pretty_name=data_adapter.pretty_name
-        )
-        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
         return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
         )
 
     @staticmethod
evalscope/report/report.py
CHANGED

@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
         score = round(score, keep_num)
     elif isinstance(score, dict):
         score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
     else:
         logger.warning(f'Unknown score type: {type(score)}')
-
     return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
     subset_name = 'Subset'
     num = 'Num'
     score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
             # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                 table[ReportKey.model_name].append(self.model_name)
                 table[ReportKey.dataset_name].append(self.dataset_name)
                 table[ReportKey.metric_name].append(metric.name)
                 table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                 table[ReportKey.num].append(metric.num)
                 table[ReportKey.score].append(metric.score)
             # NOTE: only flatten metrics if needed, use the first metric by default
evalscope/run.py
CHANGED

@@ -38,6 +38,7 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     if task_cfg.eval_backend != EvalBackend.NATIVE:
         result = run_non_native_backend(task_cfg, outputs)
     else:
+        logger.info('Running with native backend')
         result = evaluate_model(task_cfg, outputs)
 
     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
@@ -94,12 +95,15 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
 def get_backend_manager_class(eval_backend: EvalBackend):
     """Get the backend manager class based on the evaluation backend."""
     if eval_backend == EvalBackend.OPEN_COMPASS:
+        logger.info('Using OpenCompassBackendManager')
         from evalscope.backend.opencompass import OpenCompassBackendManager
         return OpenCompassBackendManager
     elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+        logger.info('Using VLMEvalKitBackendManager')
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         return VLMEvalKitBackendManager
     elif eval_backend == EvalBackend.RAG_EVAL:
+        logger.info('Using RAGEvalBackendManager')
         from evalscope.backend.rag_eval import RAGEvalBackendManager
         return RAGEvalBackendManager
     elif eval_backend == EvalBackend.THIRD_PARTY:
@@ -131,8 +135,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         )
         evaluators.append(evaluator)
 
-        # Update task_config.dataset_args with benchmark metadata
-
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
 
     # dump task_cfg to outputs.configs_dir after creating evaluators
     task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +154,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
     if model is not None:
         import gc
-        import torch
 
         del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
     return eval_results
 
 
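The cleanup block now only touches torch when it is importable. A sketch of the same guard as a standalone helper (free_accelerator_memory is a hypothetical name; check_import is used exactly as in the diff, assuming it returns a falsy value when the package is missing):

import gc

from evalscope.utils.import_utils import check_import

def free_accelerator_memory() -> None:
    """Hypothetical helper mirroring the guarded cleanup in evaluate_model."""
    gc.collect()
    # Skip the torch-specific step entirely on installs without torch (e.g. API-only evaluation).
    if check_import('torch', raise_warning=False):
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()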
evalscope/utils/chat_service.py
CHANGED

@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
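Moving `import torch` into `ChatService.__init__` defers the heavy dependency until a local model is actually served. A minimal sketch of the same pattern (hypothetical class, not the real ChatService):

class LocalChatStub:
    """Hypothetical example of the deferred-import pattern used above."""

    def __init__(self, model_path: str):
        import torch  # only imported when an instance is created
        from modelscope import AutoModelForCausalLM, AutoTokenizer

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path).to(self.device)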