evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/coin_flip/coin_flip_adapter.py
@@ -0,0 +1,128 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ DESCRIPTION = (
+     "CoinFlip is a symbolic reasoning dataset that tests an LLM's ability "
+     'to track binary state changes through a sequence of actions. '
+     'Each example describes whether a coin is flipped or not by different person, '
+     'requiring logical inference to determine the final state (heads or tails).'
+ )  # noqa: E501
+
+ PROMPT_TEMPLATE = """
+ Solve the following coin flip problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+ {question}
+
+ Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer YES or NO to the problem.
+
+ Reasoning:
+ """  # noqa: E501
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of how to solve similar problems:
+
+ {fewshot}
+
+ """.lstrip() + PROMPT_TEMPLATE  # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='coin_flip',
+         pretty_name='CoinFlip',
+         tags=[Tags.REASONING, Tags.YES_NO],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/coin-flip',
+         metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+         aggregation='f1',
+         few_shot_num=0,
+         train_split='validation',
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CoinFlipAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question']
+         answer = record['answer']
+         input_text = self.prompt_template.format(question=question)
+         content_list: List[Content] = [ContentText(text=input_text)]
+         answer = str(answer).upper()  # 'YES' or 'NO'
+         return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+             'answer': answer,
+         })
+
+     def extract_answer(self, prediction, task_state):
+         import re
+
+         match = re.search(r'ANSWER:\s*(.*)', prediction)
+         return match.group(1) if match else prediction
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         # Check for an exact match against the extracted answer.
+         result = 1 if reference in filtered_prediction else 0
+         score.value = {'acc': result}
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+         """
+
+         tp = fp = tn = fn = 0
+         yes_count = 0
+         total_count = len(sample_scores)
+
+         for ss in sample_scores:
+             gt = ss.sample_metadata['answer'].strip().upper()
+             pred = ss.score.extracted_prediction.strip().upper()
+
+             if pred == 'YES':
+                 yes_count += 1
+             if pred == 'YES' and gt == 'YES':
+                 tp += 1
+             elif pred == 'YES' and gt == 'NO':
+                 fp += 1
+             elif pred == 'NO' and gt == 'NO':
+                 tn += 1
+             elif pred == 'NO' and gt == 'YES':
+                 fn += 1
+
+         accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+         f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+         yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+         overall_metrics = {
+             'accuracy': accuracy,
+             'precision': precision,
+             'recall': recall,
+             'f1_score': f1_score,
+             'yes_ratio': yes_ratio
+         }
+
+         agg_scores = []
+         for metric_name, value in overall_metrics.items():
+             agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+         return agg_scores
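The adapter above registers under the benchmark name 'coin_flip', so it becomes selectable like any other evalscope dataset. Below is a minimal smoke-test sketch, assuming the standard TaskConfig / run_task entry points exposed by the package and a placeholder model id:

from evalscope import TaskConfig, run_task

# Placeholder model id; any supported checkpoint or API-served model works here.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['coin_flip'],  # the name registered by @register_benchmark above
    limit=20,  # small smoke test; drop to evaluate the full test split
)
run_task(task_cfg=task_cfg)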
evalscope/benchmarks/commonsense_qa/__init__.py (file without changes)
evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py
@@ -0,0 +1,32 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+ DESCRIPTION = 'CommonsenseQA requires different types of commonsense knowledge to predict the correct answers.'
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='commonsense_qa',
+         pretty_name='CommonsenseQA',
+         tags=[Tags.REASONING, Tags.COMMONSENSE, Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/commonsense-qa',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='validation',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
+ )
+ class CommonsenseQAAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -71,3 +71,8 @@ class CompetitionMathAdapter(DefaultDataAdapter):
  
      def sample_to_fewshot(self, sample: Sample) -> str:
          return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
+
+     def extract_answer(self, prediction: str, task_state):
+         from evalscope.metrics.math_parser import extract_answer
+
+         return extract_answer(prediction)
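The new extract_answer hook delegates to evalscope.metrics.math_parser.extract_answer, so grading compares only the final answer pulled out of a free-form solution. The sketch below is merely an illustration of what such a parser typically does (take the payload of the last \boxed{...}, fall back to an "answer is ..." clause); it is not the library's implementation.

import re

def extract_boxed_answer(solution: str) -> str:
    """Illustrative only: return the last \\boxed{...} payload (handling nested braces),
    else the last 'answer is ...' clause, else the stripped text."""
    start = solution.rfind(r'\boxed{')
    if start != -1:
        i = start + len(r'\boxed{')
        depth = 1
        for j in range(i, len(solution)):
            if solution[j] == '{':
                depth += 1
            elif solution[j] == '}':
                depth -= 1
                if depth == 0:
                    return solution[i:j]
    m = re.search(r'answer is[:\s]*([^\n.]+)', solution, flags=re.IGNORECASE)
    return m.group(1).strip() if m else solution.strip()

print(extract_boxed_answer(r'... so the total is \boxed{\frac{7}{2}}.'))  # -> \frac{7}{2}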
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -6,9 +6,7 @@ from typing import Any, Dict, List
  from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
  from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
  from evalscope.api.evaluator import TaskState
- from evalscope.api.metric import Score
  from evalscope.api.metric.scorer import AggScore, SampleScore
- from evalscope.api.model.model import Model
  from evalscope.api.registry import get_benchmark, register_benchmark
  from evalscope.config import TaskConfig
  from evalscope.constants import DataCollection, Tags
@@ -22,8 +20,13 @@ logger = get_logger()
  @register_benchmark(
      BenchmarkMeta(
          name=DataCollection.NAME,
+         pretty_name='Data-Collection',
          dataset_id='',  # dataset_id need to be set
-         description='Data collection',
+         description='Custom Data collection, mixing multiple evaluation datasets for '
+         'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+         'assessment of the model\'s capabilities. '
+         '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
+         tags=[Tags.CUSTOM],
          metric_list=['acc'],
          eval_split='test',
          prompt_template='',
@@ -55,9 +58,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
              data_id_or_path=dataset_path,
              split=self.eval_split,
              sample_fields=self.record_to_sample,
-             subset=self.default_subset,
+             subset='test',  # NOTE: using hardcoded test subset
              limit=self.limit,
-             repeats=self.repeats
+             repeats=self.repeats,
+             shuffle=self.shuffle,
          ).load()
  
          test_dataset = DatasetDict({self.default_subset: dataset})
@@ -95,7 +99,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
  
          # load dataset args
          dataset_args = copy.deepcopy(self._task_config.dataset_args)
-         common_args = dataset_args.get(DataCollection.NAME, {})
  
          # Iterate through each sample in the dataset
          dataset = self.test_dataset[self.default_subset]
@@ -108,7 +111,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
  
              # update dataset args
              cur_dataset_args = dataset_args.get(dataset_name, {})
-             cur_dataset_args.update(common_args)
  
              # Initialize dataset adapter
              if dataset_name not in self.dataset_adapters:
@@ -141,19 +143,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
          data = []
          for sample_score in sample_scores:
              collection_info = sample_score.sample_metadata[DataCollection.INFO]
-             for metric_name, value in sample_score.score.value.items():
-                 data.append(
-                     dict(
-                         task_type=collection_info['task_type'],
-                         categories=tuple(collection_info['categories']),
-                         dataset_name=collection_info['dataset_name'],
-                         subset_name=collection_info['subset_name'],
-                         tags=collection_info['tags'],
-                         sample_id=sample_score.sample_id,
-                         metric=metric_name,
-                         score=value
-                     )
+             main_score = sample_score.score.main_value
+             main_metric = sample_score.score.main_score_name
+
+             # use main score
+             data.append(
+                 dict(
+                     task_type=collection_info['task_type'],
+                     categories=tuple(collection_info['categories']),
+                     dataset_name=collection_info['dataset_name'],
+                     subset_name=collection_info['subset_name'],
+                     tags=collection_info['tags'],
+                     sample_id=sample_score.sample_id,
+                     metric=main_metric,
+                     score=main_score
                  )
+             )
  
          df = pd.DataFrame(data)
  
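The last hunk above also changes how collection results are flattened for reporting: each sample now contributes a single row carrying only its main metric (main_score_name / main_value) rather than one row per metric. A toy illustration with made-up rows shaped like the dict built in that hunk, plus a per-dataset rollup:

import pandas as pd

# Made-up rows shaped like the dict(...) built in the hunk above.
rows = [
    dict(task_type='math', categories=('reasoning', ), dataset_name='gsm8k',
         subset_name='main', tags=['math'], sample_id=1, metric='acc', score=1.0),
    dict(task_type='qa', categories=('knowledge', ), dataset_name='arc',
         subset_name='easy', tags=['qa'], sample_id=2, metric='acc', score=0.0),
]
df = pd.DataFrame(rows)
# One row per sample; roll up the single main metric per dataset/subset.
print(df.groupby(['dataset_name', 'subset_name', 'metric'])['score'].mean())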
evalscope/benchmarks/docvqa/__init__.py (file without changes)
evalscope/benchmarks/docvqa/docvqa_adapter.py
@@ -0,0 +1,67 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT = """Answer the question according to the image using a single word or phrase.
+ {question}
+ The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='docvqa',
+         pretty_name='DocVQA',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+         dataset_id='lmms-lab/DocVQA',
+         subset_list=['DocVQA'],
+         metric_list=['anls'],
+         eval_split='validation',
+         prompt_template=PROMPT,
+     )
+ )
+ class DocVQAAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_aggregation_name = False
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         input_text = PROMPT.format(question=record['question'])
+         content_list: List[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=json.dumps(record.get('answers')),  # answers is a list
+             metadata={
+                 'questionId': record.get('questionId'),
+                 'question_types': record.get('question_types'),
+                 'docId': record.get('docId'),
+                 'ucsf_document_id': record.get('ucsf_document_id'),
+                 'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+             }
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         import re
+
+         pattern = r'ANSWER:\s*(.*)'
+         match = re.search(pattern, prediction)
+         if match:
+             return match.group(1).strip()
+         return prediction.strip()
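DocVQA is scored with ANLS (Average Normalized Levenshtein Similarity): each prediction is credited with its best normalized string similarity against any reference answer, similarities below a threshold (conventionally 0.5) count as zero, and the dataset score is the mean over questions. A self-contained sketch of that standard formulation follows; it is not necessarily evalscope's exact implementation.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def anls(prediction: str, references: list, threshold: float = 0.5) -> float:
    # Best normalized similarity against any reference; below the threshold counts as 0.
    best = 0.0
    for ref in references:
        pred, ref = prediction.strip().lower(), ref.strip().lower()
        dist = levenshtein(pred, ref)
        best = max(best, 1.0 - dist / max(len(pred), len(ref), 1))
    return best if best >= threshold else 0.0

print(anls('Dr. Smith', ['dr smith', 'doctor smith']))  # ~0.89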
evalscope/benchmarks/drivelology/__init__.py (file without changes)
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py
@@ -0,0 +1,170 @@
+ # flake8: noqa: E501
+
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip()  # noqa: E501
+
+ FEWSHOT_PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ Here are some examples of how to solve similar problems:
+
+ #Input Text#: Saw a book called "how to solve 50 percent of your problems" so I bought 2 books.
+ #Your Answer#: Yes
+
+ #Input Text#: Colourless green ideas sleep furiously.
+ #Your Answer#: No
+
+ #Input Text#: I went to a restaurant, and saw this guy was choking. I gotta save him. And then I realized he was just speaking French.
+ #Your Answer#: Yes
+
+ #Input Text#: Either it is or it isn't.
+ #Your Answer#: No
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip()  # noqa: E501
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_binary',
+         pretty_name='DrivelologyBinaryClassification',
+         tags=[Tags.YES_NO],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['binary-classification'],
+         metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+         aggregation='f1',
+         few_shot_num=0,
+         eval_split='test',
+         prompt_template='{question}',
+         few_shot_prompt_template='{question}'
+     )
+ )
+ class DrivelologyBinaryClassificationAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+         if self.few_shot_num not in [0, 4]:
+             logger.warning(f'For DrivelologyBinaryClassification, use 4-shot by default.')
+             self.few_shot_num = 4
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         if self.few_shot_num > 0:
+             prompt = FEWSHOT_PROMPT_TEMPLATE.format(text=record['text'])
+         else:
+             prompt = PROMPT_TEMPLATE.format(text=record['text'])
+         content_list: List[Content] = [ContentText(text=prompt)]
+         answer = 'YES' if str(record['label']) == 'drivelology' else 'NO'  # 'YES' or 'NO'
+         return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+             'answer': answer,
+         })
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         # Check if the reference answer is in the filtered prediction
+         result = 1 if reference in filtered_prediction.strip().upper() else 0
+         score.value = {'acc': result}
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+         """
+
+         def compute_metrics(scores: List[SampleScore]):
+             tp = fp = tn = fn = 0
+             yes_count = 0
+             total_count = len(scores)
+
+             for ss in scores:
+                 gt = ss.sample_metadata['answer'].strip().upper()
+                 # Get prediction based on score
+                 pred = gt if ss.score.main_value == 1 else ('NO' if gt == 'YES' else 'YES')
+                 if pred == 'YES':
+                     yes_count += 1
+                 if pred == 'YES' and gt == 'YES':
+                     tp += 1
+                 elif pred == 'YES' and gt == 'NO':
+                     fp += 1
+                 elif pred == 'NO' and gt == 'NO':
+                     tn += 1
+                 elif pred == 'NO' and gt == 'YES':
+                     fn += 1
+
+             accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+             precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+             recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+             f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+             yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+             return {
+                 'accuracy': accuracy,
+                 'precision': precision,
+                 'recall': recall,
+                 'f1_score': f1_score,
+                 'yes_ratio': yes_ratio
+             }
+
+         overall_metrics = compute_metrics(sample_scores)
+         agg_scores = []
+         for metric_name, value in overall_metrics.items():
+             agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+         return agg_scores
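Both YES/NO adapters in this diff aggregate per-sample correctness into corpus-level accuracy, precision, recall, F1, and a yes_ratio. A small standalone check of that arithmetic on toy labels, independent of evalscope's classes:

# Toy ground truths and predictions for a YES/NO task.
gts   = ['YES', 'YES', 'NO', 'NO', 'YES']
preds = ['YES', 'NO',  'NO', 'YES', 'YES']

tp = sum(p == 'YES' and g == 'YES' for p, g in zip(preds, gts))
fp = sum(p == 'YES' and g == 'NO' for p, g in zip(preds, gts))
tn = sum(p == 'NO' and g == 'NO' for p, g in zip(preds, gts))
fn = sum(p == 'NO' and g == 'YES' for p, g in zip(preds, gts))

accuracy = (tp + tn) / len(gts)                       # (2 + 1) / 5 = 0.6
precision = tp / (tp + fp) if tp + fp else 0.0        # 2 / 3
recall = tp / (tp + fn) if tp + fn else 0.0           # 2 / 3
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
yes_ratio = preds.count('YES') / len(preds)           # 3 / 5

print(accuracy, round(precision, 3), round(recall, 3), round(f1, 3), yes_ratio)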