evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/docmath/docmath_adapter.py
@@ -1,6 +1,14 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import LLMJudge
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  TEMPLATE_0SHOT = """Please read the following text and answer the question below.

@@ -13,73 +21,123 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
  Format your response as follows: "Therefore, the answer is (insert answer here)"."""


- @Benchmark.register(
-     name='docmath',
-     pretty_name='DocMath',
-     tags=['Reasoning', 'Mathematics', 'Long Context'],
-     description=
-     'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
-     dataset_id='yale-nlp/DocMath-Eval',
-     metric_list=['AverageAccuracy'],
-     subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=TEMPLATE_0SHOT,
+ @register_benchmark(
+     BenchmarkMeta(
+         name='docmath',
+         pretty_name='DocMath',
+         tags=[Tags.REASONING, Tags.MATH, Tags.LONG_CONTEXT],
+         description=
+         'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+         dataset_id='yale-nlp/DocMath-Eval',
+         metric_list=['acc'],
+         subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+         eval_split='test',
+         prompt_template=TEMPLATE_0SHOT,
+     )
  )
- class DocMathAdapter(DataAdapter):
+ class DocMathAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
+         self._use_llm_judge = True  # Enable LLM judge for DocMath
+         self.split_as_subset = True  # Use split as subset for DocMath

-     def load(self, **kwargs):
-         # default load mini test
-         kwargs['split_as_subset'] = True
-         data_dict = super().load(**kwargs)
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         context = context = '\n'.join(input_d['paragraphs'])
-         question = input_d['question']
-         prompt = self.prompt_template.format(context=context, question=question)
-         return self.gen_prompt_data(prompt)
+         Convert a data record to a Sample object.

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d['ground_truth']
+         Args:
+             record (Dict[str, Any]): Input data record.

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         ground_truth = record['ground_truth']
+
+         return Sample(
+             input=record['question'],
+             target=str(ground_truth),
+             metadata={
+                 'question_id': record.get('question_id', ''),
+                 'paragraphs': record['paragraphs'],
+                 'answer_type': type(ground_truth).__name__
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         context = '\n'.join(sample.metadata['paragraphs'])
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
          from .utils import extract_answer

-         extracted_answer = extract_answer(result)
+         extracted_answer = extract_answer(prediction)
          return extracted_answer

-     def match(self, gold: str, pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference.
          """
          from .utils import get_acc

-         return get_acc(prediction=pred, gt=gold)
-
-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         answer_type = task_state.metadata.get('answer_type', 'unknown')
+         accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
+         score.value = {'acc': accuracy}
+         score.main_score_name = 'acc'
+
+         return score
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """
+         Use LLM judge to evaluate the prediction against the reference.
+         """
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['question']
-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
-         # parse grading response
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.metadata.get('question', '')
+
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+         # Parse grading response
          if 'YES' in orm_response:
-             return 1.0
+             accuracy = 1.0
          else:
-             return 0.0
+             accuracy = 0.0
+
+         score.value = {'acc': accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'acc'
+
+         return score
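
The docmath hunk above is representative of the adapter migration across this release: `@Benchmark.register(...)` on a `DataAdapter` becomes `@register_benchmark(BenchmarkMeta(...))` on a `DefaultDataAdapter`, and the old `gen_prompt`/`get_gold_answer`/`parse_pred_result`/`match` hooks become `record_to_sample`, `format_prompt_template`, `extract_answer`, and `match_score`. A minimal sketch of that pattern for a hypothetical benchmark; the name `my_docqa`, dataset id `my-org/doc-qa`, and record fields are placeholders and not part of this package:

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_docqa',                     # placeholder benchmark name
        pretty_name='MyDocQA',
        tags=[Tags.REASONING],
        description='Toy document-QA benchmark used to illustrate the 1.0.0 adapter API.',
        dataset_id='my-org/doc-qa',          # placeholder dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{context}\n\nQuestion: {question}\nAnswer:',
    )
)
class MyDocQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample; field names are assumed here.
        return Sample(
            input=record['question'],
            target=str(record['answer']),
            metadata={'context': record['context']},
        )

    def format_prompt_template(self, sample: Sample) -> str:
        # Fill the prompt template from the sample, mirroring the DocMath adapter above.
        return self.prompt_template.format(context=sample.metadata['context'], question=sample.input)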
evalscope/benchmarks/docmath/utils.py
@@ -193,23 +193,22 @@ def compare_two_numbers(p, gt):
      return within_eps(pred=p, gt=gt)


- def get_acc(prediction, gt, cot=True):
+ def get_acc(prediction, gt, answer_type, cot=True):
      try:
          if cot:
              prediction = normalize(prediction)
          else:
              prediction = float(prediction)

-         answer_type = type(gt).__name__
          assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
          if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
              # Comparing prediction against the reference
              if answer_type in ['bool']:
-                 acc = int(prediction == gt)
+                 acc = int(prediction == bool(gt))
              elif answer_type == 'int':
-                 acc = int(compare_two_numbers(prediction, gt))
+                 acc = int(compare_two_numbers(prediction, int(gt)))
              elif answer_type == 'float' or answer_type == 'float64':
-                 acc = int(compare_two_numbers(prediction, gt))
+                 acc = int(compare_two_numbers(prediction, float(gt)))
              else:
                  acc = 0
          else:
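
The `get_acc` change above reflects that in 1.0.0 the reference reaches the scorer as a string (`Sample.target`), so the answer type is passed in explicitly from sample metadata rather than inferred via `type(gt).__name__`. A call sketch under that assumption; the literal values are illustrative only:

from evalscope.benchmarks.docmath.utils import get_acc

# The adapter runs extract_answer first, so get_acc receives the filtered prediction;
# gt arrives as a string and answer_type carries the original Python type name.
accuracy = get_acc(prediction='42', gt='42', answer_type='int')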
evalscope/benchmarks/drop/drop_adapter.py
@@ -1,8 +1,13 @@
+ import ast
  import re
- from typing import List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -28,54 +33,82 @@ Answer: 43
  '''  # noqa: E501


- @Benchmark.register(
-     name='drop',
-     pretty_name='DROP',
-     tags=['Reasoning'],
-     description=
-     'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
-     dataset_id='AI-ModelScope/DROP',
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='validation',
-     prompt_template=
-     'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drop',
+         pretty_name='DROP',
+         tags=[Tags.REASONING],
+         description=
+         'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
+         dataset_id='AI-ModelScope/DROP',
+         metric_list=['acc'],
+         few_shot_num=3,
+         train_split=None,
+         eval_split='validation',
+         prompt_template=
+         'You will be asked to read a passage and answer a question. {drop_examples}\n# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+     )
  )
- class DROPAdapter(DataAdapter):
+ class DROPAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num != 0:
+         if self.few_shot_num != 0:
              self.few_shot_num = 3
              logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
          else:
              self.few_shot_num = 0

-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         Generate model prompt from input data.
+         Convert a data record to a Sample object.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
          """
-         drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
-         query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
-         prompt = self.prompt_template.format(
+
+         # Parse gold answers
+         gold_answers = self._get_gold_answers(record)
+
+         return Sample(
+             input=record['question'],
+             target=str(gold_answers),
+             metadata={
+                 'passage': record['passage'],
+                 'answer': record['answer'],
+                 'validated_answers': record['validated_answers']
+             }
+         )
+
+     def format_prompt_template(self, sample: Sample) -> str:
+         drop_examples = ''
+         query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+         return self.prompt_template.format(
              drop_examples=drop_examples,
              query=query,
          )
-         return self.gen_prompt_data(prompt)

-     def get_gold_answer(self, input_d: dict) -> List[str]:
+     def format_fewshot_template(self, fewshot, sample):
+         drop_examples = DROP_EXAMPLES
+         query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+         return self.prompt_template.format(
+             drop_examples=drop_examples,
+             query=query,
+         )
+
+     def _get_gold_answers(self, input_d: dict) -> List[str]:
          """
         Parse the raw input labels (gold).
          """

          def _flatten_validated_answers(validated_answers):
-             """Flattens a dict of lists of validated answers.
-             {"number": ['1', '8'], ...}
-             -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
-             """
+             """Flattens a dict of lists of validated answers."""
              valid_answers = []
              for i in range(len(validated_answers['number'])):
                  valid_answers.append({
@@ -96,24 +129,36 @@ class DROPAdapter(DataAdapter):
              answers.append(answer)
          return answers

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
-         match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
-         extracted_answer = match.group(1) if match else result
+         match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', prediction)
+         extracted_answer = match.group(1) if match else prediction
          return extracted_answer

-     def match(self, gold: List[str], pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference answers.
          """
          from .utils import _answer_to_bags

+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
          max_em = 0
-         for gold_answer in gold:
+         reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
+         for gold_answer in reference:
              # Convert the answers to bags of answers
-             predicted_bags = _answer_to_bags(pred)
+             predicted_bags = _answer_to_bags(filtered_prediction)
              gold_bags = _answer_to_bags(gold_answer)

              if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
@@ -124,7 +169,10 @@ class DROPAdapter(DataAdapter):
              if gold_answer[0].strip():
                  max_em = max(max_em, exact_match)

-         return max_em
+         score.value = {'acc': max_em}
+         score.main_score_name = 'acc'
+
+         return score

      @staticmethod
      def parse_answer(answer):
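
Because `Sample.target` is a string, the DROP adapter above serializes its list of gold answers with `str(...)` in `record_to_sample` and recovers it in `match_score` with `ast.literal_eval`. A small round-trip sketch; the answers are illustrative:

import ast

gold_answers = ['43', 'forty three']   # illustrative gold answers for one DROP item
target = str(gold_answers)             # what record_to_sample stores as Sample.target
recovered = ast.literal_eval(target)   # what match_score does before the bag comparison
assert recovered == gold_answers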
evalscope/benchmarks/frames/frames_adapter.py
@@ -1,6 +1,15 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import LLMJudge, exact_match
+ import os
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  TEMPLATE_0SHOT = """Please read the following text and answer the question below.

@@ -13,52 +22,82 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
  Format your response as follows: "Therefore, the answer is (insert answer here)"."""


- @Benchmark.register(
-     name='frames',
-     pretty_name='FRAMES',
-     tags=['Reasoning', 'Long Context'],
-     description=
-     'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
-     dataset_id='iic/frames',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.GENERATION],
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=TEMPLATE_0SHOT,
+ @register_benchmark(
+     BenchmarkMeta(
+         name='frames',
+         pretty_name='FRAMES',
+         tags=[Tags.REASONING, Tags.LONG_CONTEXT],
+         description=
+         'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+         dataset_id='iic/frames',
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=TEMPLATE_0SHOT,
+     )
  )
- class FramesAdapter(DataAdapter):
+ class FramesAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
+         self._use_llm_judge = True  # Enable LLM judge for FRAMES
+
+     def load(self):
+         # Try to load dataset from local disk
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download

-     def load(self, **kwargs):
-         # default load with snapshot
-         kwargs['file_structure'] = {'default': ['test.jsonl']}
-         data_dict = super().load_with_snapshot(**kwargs)
-         return data_dict
+             # Load dataset from remote
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             # download dataset snapshot
+             dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='test.jsonl')

-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
-         """
-         context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
-         question = input_d['Prompt']
-         prompt = self.prompt_template.format(context=context, question=question)
-         return self.gen_prompt_data(prompt)
+         dataset = LocalDataLoader(
+             data_id_or_path=dataset_path,
+             split=self.eval_split,
+             sample_fields=self.record_to_sample,
+             subset='test',
+             limit=self.limit,
+             repeats=self.repeats
+         ).load()

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
+         test_dataset = DatasetDict({'test': dataset})
+
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         return input_d['Answer']
+         Convert a data record to a Sample object.

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         context = '\n'.join([f"{i['title']}\n{i['text']}" for i in record['wiki_items']])
+         question = record['Prompt']
+
+         return Sample(
+             input=question, target=record['Answer'], metadata={
+                 'context': context,
+                 'wiki_items': record['wiki_items']
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         context = sample.metadata['context']
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
-         response = result.replace('*', '')
+         response = prediction.replace('*', '')

          if 'the answer is' in response:
              ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
@@ -67,25 +106,69 @@ class FramesAdapter(DataAdapter):

          return ans

-     def match(self, gold: str, pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference.
          """
+         from evalscope.metrics import exact_match
          from .utils import normalize_answer
-         gold = normalize_answer(gold)
-         pred = normalize_answer(pred)
-         return exact_match(gold=gold, pred=pred)

-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         gold = normalize_answer(reference)
+         pred = normalize_answer(filtered_prediction)
+         accuracy = exact_match(gold=gold, pred=pred)
+
+         score.value = {'acc': accuracy}
+         score.main_score_name = 'acc'
+
+         return score
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """
+         Use LLM judge to evaluate the prediction against the reference.
+         """
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['Prompt']
-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
-         # parse grading response
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+         # Parse grading response
          if 'YES' in orm_response:
-             return 1.0
+             accuracy = 1.0
          else:
-             return 0.0
+             accuracy = 0.0
+
+         score.value = {'acc': accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'acc'
+
+         return score
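
A common thread in these adapter hunks is that scoring methods no longer return a bare float: both `match_score` and `llm_match_score` build a `Score` carrying the raw and extracted predictions, a named metric dict, and optional judge metadata. A sketch of the resulting object, using only the fields visible in the hunks above; the concrete values are illustrative:

from evalscope.api.metric import Score

score = Score(
    extracted_prediction='Paris',                                   # answer after extract_answer
    prediction='Let me think... Therefore, the answer is Paris.',   # raw model output
)
score.value = {'acc': 1.0}                 # named metrics; 'acc' matches metric_list=['acc']
score.main_score_name = 'acc'
score.explanation = 'LLM judge: YES'       # only populated on the llm_match_score path
score.metadata = {'source': 'llm_judge'}   # judge provenance, as in the hunks above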