evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py ADDED
@@ -0,0 +1,294 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """
+Translate the following {source_language} sentence into {target_language}:
+
+{source_language}: {source_text}
+{target_language}:
+""".strip()
+
+LANGUAGE_PAIRS = [
+    'en-ar_eg',
+    'en-ar_sa',
+    'en-bg_bg',
+    'en-bn_in',
+    'en-ca_es',
+    'en-cs_cz',
+    'en-da_dk',
+    'en-de_de',
+    'en-el_gr',
+    'en-es_mx',
+    'en-et_ee',
+    'en-fa_ir',
+    'en-fi_fi',
+    'en-fil_ph',
+    'en-fr_ca',
+    'en-fr_fr',
+    'en-gu_in',
+    'en-he_il',
+    'en-hi_in',
+    'en-hr_hr',
+    'en-hu_hu',
+    'en-id_id',
+    'en-is_is',
+    'en-it_it',
+    'en-ja_jp',
+    'en-kn_in',
+    'en-ko_kr',
+    'en-lt_lt',
+    'en-lv_lv',
+    'en-ml_in',
+    'en-mr_in',
+    'en-nl_nl',
+    'en-no_no',
+    'en-pa_in',
+    'en-pl_pl',
+    'en-pt_br',
+    'en-pt_pt',
+    'en-ro_ro',
+    'en-ru_ru',
+    'en-sk_sk',
+    'en-sl_si',
+    'en-sr_rs',
+    'en-sv_se',
+    'en-sw_ke',
+    'en-sw_tz',
+    'en-ta_in',
+    'en-te_in',
+    'en-th_th',
+    'en-tr_tr',
+    'en-uk_ua',
+    'en-ur_pk',
+    'en-vi_vn',
+    'en-zh_cn',
+    'en-zh_tw',
+    'en-zu_za',
+]
+
+LANGUAGE_BY_CODE = {
+    'ar_eg': 'arabic',
+    'ar_sa': 'arabic',
+    'bg_bg': 'bulgarian',
+    'bn_bd': 'bengali',
+    'bn_in': 'bengali',
+    'ca_es': 'catalan',
+    'cs_cz': 'czech',
+    'da_dk': 'danish',
+    'de_de': 'german',
+    'el_gr': 'greek',
+    'es_mx': 'spanish',
+    'et_ee': 'estonian',
+    'fa_ir': 'farsi',
+    'fi_fi': 'finnish',
+    'fil_ph': 'filipino',
+    'fr_ca': 'french',
+    'fr_fr': 'french',
+    'gu_in': 'gujarati',
+    'he_il': 'hebrew',
+    'hi_in': 'hindi',
+    'hr_hr': 'croatian',
+    'hu_hu': 'hungarian',
+    'id_id': 'indonesian',
+    'is_is': 'icelandic',
+    'it_it': 'italian',
+    'ja_jp': 'japanese',
+    'kn_in': 'kannada',
+    'ko_kr': 'korean',
+    'lt_lt': 'lithuanian',
+    'lv_lv': 'latvian',
+    'ml_in': 'malayalam',
+    'mr_in': 'marathi',
+    'nl_nl': 'dutch',
+    'no_no': 'norwegian',
+    'pa_in': 'punjabi',
+    'pl_pl': 'polish',
+    'pt_br': 'portuguese',
+    'pt_pt': 'portuguese',
+    'ro_ro': 'romanian',
+    'ru_ru': 'russian',
+    'sk_sk': 'slovak',
+    'sl_si': 'slovenian',
+    'sr_rs': 'serbian',
+    'sv_se': 'swedish',
+    'sw_ke': 'swahili',
+    'sw_tz': 'swahili',
+    'ta_in': 'tamil',
+    'te_in': 'telugu',
+    'th_th': 'thai',
+    'tr_tr': 'turkish',
+    'uk_ua': 'ukrainian',
+    'ur_pk': 'urdu',
+    'vi_vn': 'vietnamese',
+    'zh_cn': 'mandarin',
+    'zh_tw': 'mandarin',
+    'zu_za': 'zulu',
+    'en': 'english',
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='wmt24pp',
+        pretty_name='WMT2024++',
+        dataset_id='extraordinarylab/wmt24pp',
+        tags=[Tags.MULTI_LINGUAL, Tags.MT],
+        description=(
+            'WMT2024 news translation benchmark supporting multiple language pairs. '
+            'Each subset represents a specific translation direction'
+        ),
+        subset_list=LANGUAGE_PAIRS,
+        eval_split='test',
+        metric_list={
+            'bleu': {},
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/xlm-roberta-large',
+                'model_type': 'xlm-roberta-large'
+            },
+            'comet': {
+                'model_id_or_path': 'evalscope/wmt22-comet-da',
+            }
+        },
+        few_shot_num=0,
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class WMT24PPAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize adapter and configure dataset subsets."""
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self.use_batch_scoring = True  # Enable batch scoring
+
+        if 'comet' in self.metric_list:
+            check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        source_text = str(record['source'])
+        target_text = str(record['target'])
+        language_pair = str(record['language_pair'])
+        source_language, target_language = language_pair.split('-')
+
+        # Format the generation prompt with the text
+        input_prompt = self.prompt_template.format(
+            source_text=source_text,
+            source_language=LANGUAGE_BY_CODE[source_language],
+            target_language=LANGUAGE_BY_CODE[target_language],
+        )
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=target_text,
+            subset_key=language_pair,
+            metadata={
+                'source_text': source_text,
+                'target_text': target_text,
+                'source_language': source_language,
+                'target_language': target_language,
+            },
+        )
+
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """Compute per-sample translation metrics."""
+        # Create a Score object for the current sample
+        score = Score(
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+            value={},
+        )
+
+        # ---- BLEU ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                bleu_results = bleu_ngram_one_sample(filtered_prediction, reference)
+                score.value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU single-sample calculation failed: {e}')
+        return score
+
+    def batch_match_score(
+        self,
+        original_predictions: List[str],
+        filtered_predictions: List[str],
+        references: List[str],
+        task_states: List[TaskState],
+    ) -> List[Score]:
+        """Compute batched translation metrics (BLEU, BERTScore, COMET)."""
+        scores: List[Score] = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={},
+            )
+            scores.append(score)
+
+        # ---- BLEU (per-sample within batch) ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                for i in range(len(scores)):
+                    bleu_results = bleu_ngram_one_sample(filtered_predictions[i], references[i])
+                    scores[i].value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU batch calculation failed: {e}')
+
+        # ---- BERTScore ----
+        if 'bert_score' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import BertScore
+
+                score_args = self.metric_list.get('bert_score', {})
+                bert_scorer = BertScore(**score_args)
+                bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+                for i in range(len(scores)):
+                    scores[i].value.update({'bert_score': bert_score_f1[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BERTScore batch calculation failed: {e}')
+
+        # ---- COMET ----
+        if 'comet' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import COMETScore
+
+                score_args = self.metric_list.get('comet', {})
+                comet_scorer = COMETScore(**score_args)
+                data = [{
+                    'src': st.metadata.get('source_text'),
+                    'mt': pred,
+                    'ref': ref
+                } for pred, ref, st in zip(filtered_predictions, references, task_states)]
+                comet_scores = comet_scorer.apply(data)
+                for i in range(len(scores)):
+                    scores[i].value.update({'comet': comet_scores[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] COMET batch calculation failed: {e}')
+
+        return scores
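The new benchmark registers itself under the name `wmt24pp`, so it can be selected like any other dataset. The snippet below is a usage sketch, not part of the diff: it assumes evalscope's public `run_task`/`TaskConfig` entry point and an OpenAI-compatible endpoint; the model name, URL, and subsets are placeholders to adjust to your setup.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-7b-instruct',           # served model name (placeholder)
    eval_type='openai_api',                # EvalType.SERVICE, see the constants diff below
    api_url='http://127.0.0.1:8801/v1',    # placeholder endpoint
    api_key='EMPTY',
    datasets=['wmt24pp'],
    dataset_args={
        'wmt24pp': {
            # evaluate only a couple of translation directions instead of all 55 pairs
            'subset_list': ['en-de_de', 'en-zh_cn'],
        }
    },
    limit=20,                              # small smoke-test run
)
run_task(task)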
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Define the prompt template
+PROMPT_TEMPLATE = """{question}
+\n\n\nLet's think step by step and give the final answer in curly braces,
+like this: {{final answer}}"
+"""
+
+SUBSET_LIST = ['default']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='zerobench',
+        pretty_name='ZeroBench',
+        dataset_id='evalscope/zerobench',
+        tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+        description=
+        'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='zerobench',
+        train_split='zerobench_subquestions',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class ZeroBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question_text']
+        content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+        image = record['question_images_decoded']
+        if len(image) > 0:
+            for img in image:
+                # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'question_id': record['question_id'],
+            'question_images': record['question_images'],
+            'image_attribution': record['image_attribution']
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+        )
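ZeroBench ships as a VisionLanguageAdapter with `_use_llm_judge = True`, so grading goes through an LLM judge. A hedged sketch of how it might be invoked follows; the `judge_model_args` key names mirror evalscope's judge configuration and are assumptions, not taken from this diff.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-vl-7b-instruct',        # any OpenAI-compatible VLM endpoint (placeholder)
    eval_type='openai_api',
    api_url='http://127.0.0.1:8801/v1',
    datasets=['zerobench'],
    judge_model_args={                     # assumed key names for the judge endpoint
        'model_id': 'qwen2.5-72b-instruct',
        'api_url': 'http://127.0.0.1:8802/v1',
        'api_key': 'EMPTY',
    },
    limit=5,
)
run_task(task)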
evalscope/cli/start_app.py CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.app import create_app
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.perf.main import run_perf_benchmark
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)
evalscope/config.py CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,12 +15,13 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
+from evalscope.version import __version__ as evalscope_version
 
 logger = get_logger()
 
@@ -28,51 +29,118 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
+    evalscope_version: Optional[str] = evalscope_version
+    """EvalScope version used for the evaluation."""
 
     def __post_init__(self):
         self.__init_model_and_id()
@@ -82,20 +150,22 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()
 
     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE
 
         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'
 
@@ -113,6 +183,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -125,7 +200,6 @@ class TaskConfig(BaseArgument):
                 }
             elif self.eval_type == EvalType.SERVICE:
                 self.generation_config = {
-                    'max_tokens': 2048,
                     'temperature': 0.0,
                 }
         if isinstance(self.generation_config, dict):
@@ -138,14 +212,14 @@ class TaskConfig(BaseArgument):
         if self.timeout is not None:
             deprecated_warning(
                 logger,
-                'The `timeout` parameter is deprecated and will be removed in
+                'The `timeout` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.timeout` instead.'
             )
             self.generation_config.timeout = self.timeout
 
         if self.stream is not None:
             deprecated_warning(
                 logger,
-                'The `stream` parameter is deprecated and will be removed in
+                'The `stream` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.stream` instead.'
             )
             self.generation_config.stream = self.stream
 
@@ -154,7 +228,7 @@ class TaskConfig(BaseArgument):
             self.generation_config.n = 1
             deprecated_warning(
                 logger,
-                'The `n` parameter in generation_config is deprecated and will be removed in
+                'The `n` parameter in generation_config is deprecated and will be removed in v2.0.0. Use `TaskConfig.repeats` instead.'
             )
 
     def __init_default_model_args(self):
@@ -167,6 +241,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }
 
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -182,9 +264,12 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
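The TaskConfig changes above add sandboxed code execution, per-sample `repeats`, and support for passing a Model/ModelAPI object directly instead of a string. Below is a minimal sketch of the new options, relying only on the field semantics documented in the docstrings above; the model name and endpoint are placeholders, and the concrete sandbox settings come from ms_enclave rather than this diff.

from evalscope import TaskConfig

task = TaskConfig(
    model='qwen2.5-coder-7b-instruct',   # placeholder served model
    eval_type='openai_api',
    api_url='http://127.0.0.1:8801/v1',
    datasets=['live_code_bench'],
    repeats=4,                  # new: repeat samples for k-metrics (e.g. pass@k)
    use_sandbox=True,           # new: requires `pip install 'ms_enclave[docker]'`
    sandbox_type='docker',      # default sandbox backend
    sandbox_manager_config={},  # local manager by default; add 'url' for a remote manager
)
print(task.to_dict()['use_sandbox'])    # to_dict() still strips api_key from the dump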
evalscope/constants.py CHANGED
@@ -15,6 +15,8 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds
 
 
 class HubType:
@@ -70,6 +72,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -119,6 +122,7 @@ class Tags:
     CHINESE = 'Chinese'
     COMMONSENSE = 'Commonsense'
     QA = 'QA'
+    NER = 'NER'
     READING_COMPREHENSION = 'ReadingComprehension'
     CUSTOM = 'Custom'
     INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -127,3 +131,17 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+    YES_NO = 'Yes/No'
+    HALLUCINATION = 'Hallucination'
+    MEDICAL = 'Medical'
+    AGENT = 'Agent'
+    MT = 'MachineTranslation'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'