evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py
CHANGED
```diff
@@ -1,7 +1,5 @@
 import numpy as np
-import os
 import random
-import torch
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

@@ -69,8 +67,13 @@ def seed_everything(seed: int):
     """
     random.seed(seed)
     np.random.seed(seed)
-    …
-    …
-    …
-    torch.…
-    torch.…
+    try:
+        import torch
+
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    except ImportError:
+        pass
```
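The practical effect of this hunk is that `torch` becomes an optional dependency for seeding: `seed_everything` still seeds `random` and NumPy, and only touches the PyTorch/CUDA RNGs when `torch` can be imported. Usage is unchanged; a minimal sketch:

```python
from evalscope.utils.model_utils import seed_everything

# Seeds random and numpy; also seeds torch/CUDA and pins cuDNN to
# deterministic mode when torch happens to be installed.
seed_everything(42)
```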
evalscope/utils/multi_choices.py
ADDED
```python
# flake8: noqa: E501
from __future__ import annotations

import re
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
    from evalscope.api.evaluator import Choices, Target, TaskState

FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:

{fewshot}

""".lstrip()

CHINESE_FEW_SHOT_TEMPLATE = r"""以下是一些示例问题:

{fewshot}

""".lstrip()

CHINESE_SINGLE_ANSWER_TEMPLATE = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。

问题:{question}
选项:
{choices}
""".lstrip()

CHINESE_SINGLE_ANSWER_TEMPLATE_COT = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。请在回答前进行一步步思考。

问题:{question}
选项:
{choices}
""".lstrip()

SINGLE_ANSWER_TEMPLATE = r"""
Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.

{question}

{choices}
""".strip()

SINGLE_ANSWER_TEMPLATE_COT = r"""
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.

{question}

{choices}
""".strip()

MULTIPLE_ANSWER_TEMPLATE = r"""
Answer the following multiple choice question where multiple answers may be correct. The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.

{question}

{choices}
""".strip()

MULTIPLE_ANSWER_TEMPLATE_COT = r"""
Answer the following multiple choice question where multiple answers may be correct. The last line of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}. Think step by step before answering.

{question}

{choices}
""".strip()


def unshuffle_choices(choices: Choices) -> Choices:
    # `sorted` returns `list[Choice]`, but for consistency we wrap this back
    # into a `Choices` object
    return Choices(sorted(choices, key=lambda choice: choice.original_position))


def answer_options(choices: Choices) -> str:
    r"""
    Returns the `choices` formatted as a multiple choice question, e.g.:

    ["choice 1", "choice 2", "choice 3"] ->
    "A) choice 1\nB) choice 2\nC) choice 3"
    """
    indexes = list(range(len(choices)))

    return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])


def prompt(question: str, choices: Choices, template: str, fewshot: Optional[str] = None) -> str:

    choices_text = answer_options(choices)
    letters = ','.join(answer_character(i) for i in range(len(choices)))
    if not fewshot:
        return template.format(
            choices=choices_text,
            letters=letters,
            question=question,
        )
    else:
        return template.format(
            choices=choices_text,
            letters=letters,
            question=question,
            fewshot=fewshot,
        )


def format_example(
    question: str,
    choices: Choices,
    answer: Target,
) -> str:
    """Format a single example for few-shot learning.

    Args:
        question (str): The question text.
        choices (list[str]): The list of choices.
        answer (list[str]): The correct answers.

    Returns:
        str: Formatted example string.
    """
    choices_text = answer_options(choices)
    return f'{question}\n{choices_text}\nANSWER: {answer.text}'


def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
    """
    Convenience function for extracting answers from the state output.

    The generated response must be in the format 'ANSWER: <answers>',
    otherwise we can't extract what the model thinks is "true". We can be a
    bit flexible whether these are "AB" vs "A,B" vs "A B".

    However, if the answer isn't in the expected format the model has
    failed in the task so we'll ultimately just mark it as incorrect
    """
    # First check whether the string strictly ends with the expected answer
    # In this case, we're looking for a single line which contains the expected
    # ANSWER: <answer> string with only whitespace or a period/full stop at the end.
    match = re.search(
        r'(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n|\.)',
        state.output.completion,
        flags=re.MULTILINE,
    )

    # If we couldn't match the strict version, we can try the less strict
    # version for backward compatibility
    if match is None:
        match = re.search(
            r'(?i)ANSWER\s*:\s*([A-Za-z\d ,]+)(?:[^\w]|\n|$|\.)',
            state.output.completion,
        )

    if match is None:
        return set()

    matched = match.group(1)

    # Strip trailing period / full stop
    matched = matched.strip()
    matched = matched.rstrip('.')

    allowed_options = set(answer_character(i) for i in range(len(state.choices)))

    if multiple_correct:
        # Match must contain only the allowed choices
        # (may be separated by commas, spaces, the word 'and', or nothing at all)

        matched = matched.replace(' and ', '')

        matched = matched.replace(' ', '')

        split_comma = set(matched.split(','))
        if split_comma.issubset(allowed_options):
            answers = split_comma
            return answers

        split_nothing = set(matched)
        if split_nothing.issubset(allowed_options):
            answers = split_nothing
            return answers

    else:
        # Match must contain a single letter in the allowed choices
        if matched in allowed_options:
            answers = {matched}
            return answers

    return set()


def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[str]:
    """
    Convenience function for extracting answers from the state output in Chinese format.

    The generated response must be in the format '答案:选项',
    otherwise we can't extract what the model thinks is "true". We can be a
    bit flexible whether these are "AB" vs "A,B" vs "A B".
    """
    # Simple pattern to capture answers with optional bold markdown
    pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
    match = re.search(pattern, state.output.completion, flags=re.MULTILINE)

    if match is None:
        return set()

    matched = match.group(1).strip().rstrip('。.')
    allowed_options = set(answer_character(i) for i in range(len(state.choices)))

    if multiple_correct:
        # Handle comma-separated or continuous letters
        matched = matched.replace(' 和 ', '').replace(' ', '').replace(',', ',')
        answers = set(matched.split(',')) if ',' in matched else set(matched)
        return answers if answers.issubset(allowed_options) else set()
    else:
        # Single answer
        return {matched} if matched in allowed_options else set()


def set_choices_based_on_generated_response(state: TaskState, answers: set[str]) -> None:
    true_answers = [answer_index(letter) for letter in answers]

    for i in range(len(state.choices)):
        if i in true_answers:
            state.choices.mark_choice(i, True)
        else:
            state.choices.mark_choice(i, False)


def valid_template(template: str) -> bool:
    """Check if a template has the required capture groups for a multiple choice question"""
    return bool(re.search(r'\{question\}', template) and re.search(r'\{choices\}', template))


class MultipleChoiceTemplate:
    """
    Templates for multiple choice questions.
    """

    SINGLE_ANSWER = SINGLE_ANSWER_TEMPLATE
    SINGLE_ANSWER_COT = SINGLE_ANSWER_TEMPLATE_COT
    MULTIPLE_ANSWER = MULTIPLE_ANSWER_TEMPLATE
    MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
    CHINESE_FEW_SHOT_TEMPLATE = CHINESE_FEW_SHOT_TEMPLATE
    CHINESE_SINGLE_ANSWER_TEMPLATE = CHINESE_SINGLE_ANSWER_TEMPLATE
    CHINESE_SINGLE_ANSWER_TEMPLATE_COT = CHINESE_SINGLE_ANSWER_TEMPLATE_COT


def answer_character(index: int) -> str:
    r"""
    Helper to go from array index to char, for example:

    0 -> 'A', 1 -> 'B', etc
    """
    if index < 26:
        return chr(ord('A') + index)
    else:
        return str(index - 25)


def answer_index(char: str) -> int:
    r"""
    Helper to go from char to array index, for example:

    'A' -> 0, 'B' -> 1, etc
    """
    if char.isalpha() or char == ',' or char == ' ':
        return ord(char.upper()) - ord('A')
    elif char.isnumeric():
        return 25 + int(char)
    else:
        raise ValueError(f'Unepxected multiple choice answer: {char} (must be a letter or number)')
```
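A brief usage sketch of the letter/index helpers and templates above (assuming evalscope 1.0 is installed; the parsing functions themselves operate on `TaskState`/`Choices` objects from `evalscope.api.evaluator`):

```python
from evalscope.utils.multi_choices import (
    MultipleChoiceTemplate,
    answer_character,
    answer_index,
    valid_template,
)

# Choice positions map to letters and back: 0 -> 'A', 1 -> 'B', ...
assert answer_character(2) == 'C'
assert answer_index('C') == 2

# Templates expose {question}, {choices} and {letters} placeholders;
# valid_template() checks that {question} and {choices} are present.
assert valid_template(MultipleChoiceTemplate.SINGLE_ANSWER_COT)
```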
evalscope/utils/url_utils.py
ADDED
```python
import base64
import httpx
import mimetypes
import re


def is_http_url(url: str) -> bool:
    return url.startswith('http://') or url.startswith('https://')


def is_data_uri(url: str) -> bool:
    pattern = r'^data:([^;]+);base64,.*'
    return re.match(pattern, url) is not None


def data_uri_mime_type(data_url: str) -> str | None:
    pattern = r'^data:([^;]+);.*'
    match = re.match(pattern, data_url)
    if match:
        mime_type = match.group(1)
        return mime_type
    else:
        return None


def data_uri_to_base64(data_uri: str) -> str:
    pattern = r'^data:[^,]+,'
    stripped_uri = re.sub(pattern, '', data_uri)
    return stripped_uri


def file_as_data(file: str) -> tuple[bytes, str]:
    if is_data_uri(file):
        # resolve mime type and base64 content
        mime_type = data_uri_mime_type(file) or 'image/png'
        file_base64 = data_uri_to_base64(file)
        file_bytes = base64.b64decode(file_base64)
    else:
        # guess mime type; need strict=False for webp images
        type, _ = mimetypes.guess_type(file, strict=False)
        if type:
            mime_type = type
        else:
            mime_type = 'image/png'

        # handle url or file
        if is_http_url(file):
            client = httpx.Client()
            file_bytes = client.get(file).content
        else:
            with open(file, 'rb') as f:
                file_bytes = f.read()

    # return bytes and type
    return file_bytes, mime_type


def file_as_data_uri(file: str) -> str:
    if is_data_uri(file):
        return file
    else:
        bytes, mime_type = file_as_data(file)
        base64_file = base64.b64encode(bytes).decode('utf-8')
        file = f'data:{mime_type};base64,{base64_file}'
    return file
```
{evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.1
+Version: 1.0.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -18,7 +18,10 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: …
+Requires-Dist: colorlog
+Requires-Dist: datasets==3.6.0
+Requires-Dist: docstring-parser
+Requires-Dist: dotenv
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -28,11 +31,14 @@ Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]>=1.27
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
+Requires-Dist: overrides
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
+Requires-Dist: pydantic
 Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
+Requires-Dist: rich
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
@@ -50,10 +56,14 @@ Requires-Dist: iopath; extra == "aigc"
 Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
+Requires-Dist: peft>=0.17; extra == "aigc"
 Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: …
+Requires-Dist: colorlog; extra == "all"
+Requires-Dist: datasets==3.6.0; extra == "all"
+Requires-Dist: docstring-parser; extra == "all"
+Requires-Dist: dotenv; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -63,11 +73,14 @@ Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]>=1.27; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
+Requires-Dist: overrides; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pydantic; extra == "all"
 Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
+Requires-Dist: rich; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
@@ -91,7 +104,6 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
 Requires-Dist: numpy; extra == "all"
-Requires-Dist: rich; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: uvicorn; extra == "all"
@@ -102,8 +114,9 @@ Requires-Dist: iopath; extra == "all"
 Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
+Requires-Dist: peft>=0.17; extra == "all"
 Requires-Dist: torchvision; extra == "all"
-Requires-Dist: bfcl-eval; extra == "all"
+Requires-Dist: bfcl-eval==2025.6.16; extra == "all"
 Requires-Dist: human-eval; extra == "all"
 Requires-Dist: pytest; extra == "all"
 Requires-Dist: pytest-cov; extra == "all"
@@ -112,7 +125,7 @@ Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: dev
-Requires-Dist: bfcl-eval; extra == "dev"
+Requires-Dist: bfcl-eval==2025.6.16; extra == "dev"
 Requires-Dist: human-eval; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
@@ -175,9 +188,9 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [📝 Introduction](#-introduction)
 - [☎ User Groups](#-user-groups)
 - [🎉 News](#-news)
-- [🛠️ …
-- [Method 1…
-- [Method 2…
+- [🛠️ Environment Setup](#️-environment-setup)
+- [Method 1. Install via pip](#method-1-install-via-pip)
+- [Method 2. Install from source](#method-2-install-from-source)
 - [🚀 Quick Start](#-quick-start)
 - [Method 1. Using Command Line](#method-1-using-command-line)
 - [Method 2. Using Python Code](#method-2-using-python-code)
@@ -258,6 +271,13 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+
+> [!IMPORTANT]
+> **Version 1.0 Refactoring**
+>
+> Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+- 🔥 **[2025.08.22]** Version 1.0 Refactoring.
 - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
 - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
 - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
@@ -268,12 +288,12 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+<details><summary>More</summary>
+
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
-<details><summary>More</summary>
-
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -306,58 +326,87 @@ Please scan the QR code below to join our community groups:
 
 </details>
 
-## 🛠️ …
-…
-…
+## 🛠️ Environment Setup
+
+### Method 1. Install via pip
+
+We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.
 
 1. Create a conda environment (optional)
+```shell
+# Python 3.10 is recommended
+conda create -n evalscope python=3.10
+
+# Activate the conda environment
+conda activate evalscope
+```
+2. Install dependencies via pip
+```shell
+pip install evalscope
+```
+3. Install additional dependencies (optional)
+- To use model service inference benchmarking features, install the perf dependency:
 ```shell
-…
-conda create -n evalscope python=3.10
-# Activate the conda environment
-conda activate evalscope
+pip install 'evalscope[perf]'
 ```
-…
-…
+- To use visualization features, install the app dependency:
+```shell
+pip install 'evalscope[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install 'evalscope[opencompass]'
+pip install 'evalscope[vlmeval]'
+pip install 'evalscope[rag]'
+```
+- To install all dependencies:
 ```shell
-pip install evalscope
-# Additional options
-pip install 'evalscope[opencompass]' # Install OpenCompass backend
-pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
-pip install 'evalscope[rag]' # Install RAGEval backend
-pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
-pip install 'evalscope[app]' # Install dependencies for visualization
-pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+pip install 'evalscope[all]'
 ```
 
-> [!…
-> …
+> [!NOTE]
+> The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
 > ```shell
-> …
+> pip install llmuses<=0.4.3
 > ```
-> …
-> ```
+> Then, import related dependencies using `llmuses`:
+> ```python
 > from llmuses import ...
 > ```
 
-### Method 2…
-…
-…
-git clone https://github.com/modelscope/evalscope.git
-```
+### Method 2. Install from source
+
+Installing from source allows you to use the latest code and makes it easier for further development and debugging.
 
+1. Clone the source code
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 2. Install dependencies
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+```shell
+cd evalscope/
+
+pip install -e .
+```
+3. Install additional dependencies
+- To use model service inference benchmarking features, install the perf dependency:
+```shell
+pip install '.[perf]'
+```
+- To use visualization features, install the app dependency:
+```shell
+pip install '.[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install '.[opencompass]'
+pip install '.[vlmeval]'
+pip install '.[rag]'
+```
+- To install all dependencies:
+```shell
+pip install '.[all]'
+```
 
 
 ## 🚀 Quick Start
````
|