evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py ADDED
@@ -0,0 +1,294 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """
+Translate the following {source_language} sentence into {target_language}:
+
+{source_language}: {source_text}
+{target_language}:
+""".strip()
+
+LANGUAGE_PAIRS = [
+    'en-ar_eg',
+    'en-ar_sa',
+    'en-bg_bg',
+    'en-bn_in',
+    'en-ca_es',
+    'en-cs_cz',
+    'en-da_dk',
+    'en-de_de',
+    'en-el_gr',
+    'en-es_mx',
+    'en-et_ee',
+    'en-fa_ir',
+    'en-fi_fi',
+    'en-fil_ph',
+    'en-fr_ca',
+    'en-fr_fr',
+    'en-gu_in',
+    'en-he_il',
+    'en-hi_in',
+    'en-hr_hr',
+    'en-hu_hu',
+    'en-id_id',
+    'en-is_is',
+    'en-it_it',
+    'en-ja_jp',
+    'en-kn_in',
+    'en-ko_kr',
+    'en-lt_lt',
+    'en-lv_lv',
+    'en-ml_in',
+    'en-mr_in',
+    'en-nl_nl',
+    'en-no_no',
+    'en-pa_in',
+    'en-pl_pl',
+    'en-pt_br',
+    'en-pt_pt',
+    'en-ro_ro',
+    'en-ru_ru',
+    'en-sk_sk',
+    'en-sl_si',
+    'en-sr_rs',
+    'en-sv_se',
+    'en-sw_ke',
+    'en-sw_tz',
+    'en-ta_in',
+    'en-te_in',
+    'en-th_th',
+    'en-tr_tr',
+    'en-uk_ua',
+    'en-ur_pk',
+    'en-vi_vn',
+    'en-zh_cn',
+    'en-zh_tw',
+    'en-zu_za',
+]
+
+LANGUAGE_BY_CODE = {
+    'ar_eg': 'arabic',
+    'ar_sa': 'arabic',
+    'bg_bg': 'bulgarian',
+    'bn_bd': 'bengali',
+    'bn_in': 'bengali',
+    'ca_es': 'catalan',
+    'cs_cz': 'czech',
+    'da_dk': 'danish',
+    'de_de': 'german',
+    'el_gr': 'greek',
+    'es_mx': 'spanish',
+    'et_ee': 'estonian',
+    'fa_ir': 'farsi',
+    'fi_fi': 'finnish',
+    'fil_ph': 'filipino',
+    'fr_ca': 'french',
+    'fr_fr': 'french',
+    'gu_in': 'gujarati',
+    'he_il': 'hebrew',
+    'hi_in': 'hindi',
+    'hr_hr': 'croatian',
+    'hu_hu': 'hungarian',
+    'id_id': 'indonesian',
+    'is_is': 'icelandic',
+    'it_it': 'italian',
+    'ja_jp': 'japanese',
+    'kn_in': 'kannada',
+    'ko_kr': 'korean',
+    'lt_lt': 'lithuanian',
+    'lv_lv': 'latvian',
+    'ml_in': 'malayalam',
+    'mr_in': 'marathi',
+    'nl_nl': 'dutch',
+    'no_no': 'norwegian',
+    'pa_in': 'punjabi',
+    'pl_pl': 'polish',
+    'pt_br': 'portuguese',
+    'pt_pt': 'portuguese',
+    'ro_ro': 'romanian',
+    'ru_ru': 'russian',
+    'sk_sk': 'slovak',
+    'sl_si': 'slovenian',
+    'sr_rs': 'serbian',
+    'sv_se': 'swedish',
+    'sw_ke': 'swahili',
+    'sw_tz': 'swahili',
+    'ta_in': 'tamil',
+    'te_in': 'telugu',
+    'th_th': 'thai',
+    'tr_tr': 'turkish',
+    'uk_ua': 'ukrainian',
+    'ur_pk': 'urdu',
+    'vi_vn': 'vietnamese',
+    'zh_cn': 'mandarin',
+    'zh_tw': 'mandarin',
+    'zu_za': 'zulu',
+    'en': 'english',
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='wmt24pp',
+        pretty_name='WMT2024++',
+        dataset_id='extraordinarylab/wmt24pp',
+        tags=[Tags.MULTI_LINGUAL, Tags.MT],
+        description=(
+            'WMT2024 news translation benchmark supporting multiple language pairs. '
+            'Each subset represents a specific translation direction'
+        ),
+        subset_list=LANGUAGE_PAIRS,
+        eval_split='test',
+        metric_list={
+            'bleu': {},
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/xlm-roberta-large',
+                'model_type': 'xlm-roberta-large'
+            },
+            'comet': {
+                'model_id_or_path': 'evalscope/wmt22-comet-da',
+            }
+        },
+        few_shot_num=0,
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class WMT24PPAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize adapter and configure dataset subsets."""
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self.use_batch_scoring = True  # Enable batch scoring
+
+        if 'comet' in self.metric_list:
+            check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        source_text = str(record['source'])
+        target_text = str(record['target'])
+        language_pair = str(record['language_pair'])
+        source_language, target_language = language_pair.split('-')
+
+        # Format the generation prompt with the text
+        input_prompt = self.prompt_template.format(
+            source_text=source_text,
+            source_language=LANGUAGE_BY_CODE[source_language],
+            target_language=LANGUAGE_BY_CODE[target_language],
+        )
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=target_text,
+            subset_key=language_pair,
+            metadata={
+                'source_text': source_text,
+                'target_text': target_text,
+                'source_language': source_language,
+                'target_language': target_language,
+            },
+        )
+
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """Compute per-sample translation metrics."""
+        # Create a Score object for the current sample
+        score = Score(
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+            value={},
+        )
+
+        # ---- BLEU ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                bleu_results = bleu_ngram_one_sample(filtered_prediction, reference)
+                score.value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU single-sample calculation failed: {e}')
+        return score
+
+    def batch_match_score(
+        self,
+        original_predictions: List[str],
+        filtered_predictions: List[str],
+        references: List[str],
+        task_states: List[TaskState],
+    ) -> List[Score]:
+        """Compute batched translation metrics (BLEU, BERTScore, COMET)."""
+        scores: List[Score] = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={},
+            )
+            scores.append(score)
+
+        # ---- BLEU (per-sample within batch) ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                for i in range(len(scores)):
+                    bleu_results = bleu_ngram_one_sample(filtered_predictions[i], references[i])
+                    scores[i].value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU batch calculation failed: {e}')
+
+        # ---- BERTScore ----
+        if 'bert_score' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import BertScore
+
+                score_args = self.metric_list.get('bert_score', {})
+                bert_scorer = BertScore(**score_args)
+                bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+                for i in range(len(scores)):
+                    scores[i].value.update({'bert_score': bert_score_f1[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BERTScore batch calculation failed: {e}')
+
+        # ---- COMET ----
+        if 'comet' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import COMETScore
+
+                score_args = self.metric_list.get('comet', {})
+                comet_scorer = COMETScore(**score_args)
+                data = [{
+                    'src': st.metadata.get('source_text'),
+                    'mt': pred,
+                    'ref': ref
+                } for pred, ref, st in zip(filtered_predictions, references, task_states)]
+                comet_scores = comet_scorer.apply(data)
+                for i in range(len(scores)):
+                    scores[i].value.update({'comet': comet_scores[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] COMET batch calculation failed: {e}')
+
+        return scores
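The new benchmark registers itself under the name `wmt24pp`, so it can be selected like any other dataset. The snippet below is a usage sketch, not part of the diff: it assumes evalscope's public `run_task`/`TaskConfig` entry point and an OpenAI-compatible endpoint; the model name, URL, and subsets are placeholders to adjust to your setup.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-7b-instruct',           # served model name (placeholder)
    eval_type='openai_api',                # EvalType.SERVICE, see the constants diff below
    api_url='http://127.0.0.1:8801/v1',    # placeholder endpoint
    api_key='EMPTY',
    datasets=['wmt24pp'],
    dataset_args={
        'wmt24pp': {
            # evaluate only a couple of translation directions instead of all 55 pairs
            'subset_list': ['en-de_de', 'en-zh_cn'],
        }
    },
    limit=20,                              # small smoke-test run
)
run_task(task)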
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Define the prompt template
+PROMPT_TEMPLATE = """{question}
+\n\n\nLet's think step by step and give the final answer in curly braces,
+like this: {{final answer}}"
+"""
+
+SUBSET_LIST = ['default']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='zerobench',
+        pretty_name='ZeroBench',
+        dataset_id='evalscope/zerobench',
+        tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+        description=
+        'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='zerobench',
+        train_split='zerobench_subquestions',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class ZeroBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question_text']
+        content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+        image = record['question_images_decoded']
+        if len(image) > 0:
+            for img in image:
+                # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'question_id': record['question_id'],
+            'question_images': record['question_images'],
+            'image_attribution': record['image_attribution']
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+        )
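ZeroBench ships as a VisionLanguageAdapter with `_use_llm_judge = True`, so grading goes through an LLM judge. A hedged sketch of how it might be invoked follows; the `judge_model_args` key names mirror evalscope's judge configuration and are assumptions, not taken from this diff.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-vl-7b-instruct',        # any OpenAI-compatible VLM endpoint (placeholder)
    eval_type='openai_api',
    api_url='http://127.0.0.1:8801/v1',
    datasets=['zerobench'],
    judge_model_args={                     # assumed key names for the judge endpoint
        'model_id': 'qwen2.5-72b-instruct',
        'api_url': 'http://127.0.0.1:8802/v1',
        'api_key': 'EMPTY',
    },
    limit=5,
)
run_task(task)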
evalscope/cli/start_app.py CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.app import create_app
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
 
         create_app(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        from evalscope.perf.main import run_perf_benchmark
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
 
         run_perf_benchmark(self.args)
evalscope/config.py CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.api.model import GenerateConfig
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
 from evalscope.constants import (
     DEFAULT_DATASET_CACHE_DIR,
     DEFAULT_WORK_DIR,
@@ -15,12 +15,13 @@ from evalscope.constants import (
     HubType,
     JudgeStrategy,
     ModelTask,
-    OutputType,
 )
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
 from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
+from evalscope.version import __version__ as evalscope_version
 
 logger = get_logger()
 
@@ -28,51 +29,118 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Optional[str] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
-
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
     generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
     eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
     rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
+    evalscope_version: Optional[str] = evalscope_version
+    """EvalScope version used for the evaluation."""
 
     def __post_init__(self):
         self.__init_model_and_id()
@@ -82,20 +150,22 @@ class TaskConfig(BaseArgument):
         # Set default generation_config and model_args
         self.__init_default_generation_config()
         self.__init_default_model_args()
+        self.__init_default_sandbox_config()
 
     def __init_model_and_id(self):
         # Set model to DummyCustomModel if not provided
         if self.model is None:
             self.model = self.model_task
             self.eval_type = EvalType.MOCK_LLM
-        else:
-            if self.model_task == ModelTask.IMAGE_GENERATION:
-                self.eval_type = EvalType.TEXT2IMAGE
 
         # Set model_id if not provided
         if not self.model_id:
-            if self.model:
+            if isinstance(self.model, str):
                 self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
                 self.model_id = 'dummy_model'
 
@@ -113,6 +183,11 @@ class TaskConfig(BaseArgument):
                 'num_inference_steps': 50,
                 'guidance_scale': 9.0,
             }
+            if self.eval_batch_size != 1:
+                logger.warning(
+                    'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                )
+                self.eval_batch_size = 1
         elif self.model_task == ModelTask.TEXT_GENERATION:
             if self.eval_type == EvalType.CHECKPOINT:
                 self.generation_config = {
@@ -125,7 +200,6 @@ class TaskConfig(BaseArgument):
                 }
             elif self.eval_type == EvalType.SERVICE:
                 self.generation_config = {
-                    'max_tokens': 2048,
                     'temperature': 0.0,
                 }
         if isinstance(self.generation_config, dict):
@@ -138,14 +212,14 @@ class TaskConfig(BaseArgument):
         if self.timeout is not None:
             deprecated_warning(
                 logger,
-                'The `timeout` parameter is deprecated and will be removed in
+                'The `timeout` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.timeout` instead.'
             )
             self.generation_config.timeout = self.timeout
 
         if self.stream is not None:
             deprecated_warning(
                 logger,
-                'The `stream` parameter is deprecated and will be removed in
+                'The `stream` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.stream` instead.'
             )
             self.generation_config.stream = self.stream
 
@@ -154,7 +228,7 @@ class TaskConfig(BaseArgument):
             self.generation_config.n = 1
             deprecated_warning(
                 logger,
-                'The `n` parameter in generation_config is deprecated and will be removed in
+                'The `n` parameter in generation_config is deprecated and will be removed in v2.0.0. Use `TaskConfig.repeats` instead.'
             )
 
     def __init_default_model_args(self):
@@ -167,6 +241,14 @@ class TaskConfig(BaseArgument):
             'precision': 'torch.float16',
         }
 
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
+
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -182,9 +264,12 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')
 
     def to_dict(self):
-        result = copy.
+        result = copy.copy(self.__dict__)
         del result['api_key']  # Do not expose api_key in the config
 
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
+
         if isinstance(self.generation_config, GenerateConfig):
             result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result
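The TaskConfig changes above add sandboxed code execution, per-sample `repeats`, and support for passing a Model/ModelAPI object directly instead of a string. Below is a minimal sketch of the new options, relying only on the field semantics documented in the docstrings above; the model name and endpoint are placeholders, and the concrete sandbox settings come from ms_enclave rather than this diff.

from evalscope import TaskConfig

task = TaskConfig(
    model='qwen2.5-coder-7b-instruct',   # placeholder served model
    eval_type='openai_api',
    api_url='http://127.0.0.1:8801/v1',
    datasets=['live_code_bench'],
    repeats=4,                  # new: repeat samples for k-metrics (e.g. pass@k)
    use_sandbox=True,           # new: requires `pip install 'ms_enclave[docker]'`
    sandbox_type='docker',      # default sandbox backend
    sandbox_manager_config={},  # local manager by default; add 'url' for a remote manager
)
print(task.to_dict()['use_sandbox'])    # to_dict() still strips api_key from the dump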
evalscope/constants.py CHANGED
@@ -15,6 +15,8 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old versio
 DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
     os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
 )  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds
 
 
 class HubType:
@@ -70,6 +72,7 @@ class EvalType:
     CHECKPOINT = 'llm_ckpt'  # native model checkpoint
     SERVICE = 'openai_api'  # model service
     TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
 
 
 class OutputType:
@@ -119,6 +122,7 @@ class Tags:
     CHINESE = 'Chinese'
     COMMONSENSE = 'Commonsense'
     QA = 'QA'
+    NER = 'NER'
     READING_COMPREHENSION = 'ReadingComprehension'
     CUSTOM = 'Custom'
     INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -127,3 +131,17 @@ class Tags:
     RETRIEVAL = 'Retrieval'
     FUNCTION_CALLING = 'FunctionCalling'
     TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+    YES_NO = 'Yes/No'
+    HALLUCINATION = 'Hallucination'
+    MEDICAL = 'Medical'
+    AGENT = 'Agent'
+    MT = 'MachineTranslation'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'