evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +20 -25
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/data_adapter.py +97 -16
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/utils.py +25 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +18 -6
- evalscope/config.py +8 -2
- evalscope/evaluator/evaluator.py +38 -27
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/models/adapters/server_adapter.py +2 -6
- evalscope/perf/arguments.py +2 -2
- evalscope/perf/benchmark.py +0 -9
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +60 -3
- evalscope/run.py +12 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -35
- tests/cli/test_collection.py +7 -6
- tests/cli/test_run.py +21 -11
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/data_adapter.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Union
 
-from evalscope.benchmarks.utils import PromptData, preprocess_decorator
+from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
 from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
@@ -15,6 +15,13 @@ logger = get_logger()
 
 
 class DataAdapter(ABC):
+    """
+    Data Adapter for the benchmark. You need to implement the following methods:
+        - gen_prompt
+        - get_gold_answer
+        - parse_pred_result
+        - match
+    """
 
     def __init__(self,
                  name: str,
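For orientation, those four methods are exactly what a concrete benchmark supplies. Below is a minimal sketch of a custom adapter, loosely modeled on the DocMath and FRAMES adapters added later in this diff; the benchmark name and the 'question'/'answer' field names are illustrative, not part of the API.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import exact_match


@Benchmark.register(
    name='my_benchmark',              # illustrative registration name
    dataset_id='my_org/my_dataset',   # illustrative dataset id or local path
    metric_list=['AverageAccuracy'],
    eval_split='test',
)
class MyAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        return self.gen_prompt_data(input_d['question'])  # assumes a 'question' field in the dataset

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']  # assumes an 'answer' field in the dataset

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)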
@@ -31,30 +38,36 @@ class DataAdapter(ABC):
                  system_prompt: Optional[str] = None,
                  query_template: Optional[str] = None,
                  pretty_name: Optional[str] = None,
+                 description: Optional[str] = None,
                  **kwargs):
         """
-        Data Adapter for the benchmark. You need to implement the following methods:
-            - gen_prompt
-            - get_gold_answer
-            - parse_pred_result
-            - match
         Args:
             name: str, the name of the benchmark.
             dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
+            model_adapter: str, the model adapter to use for the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
+            llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
+            output_types: list, the output types of the model adapter. Default: [model_adapter]
             few_shot_num: int, number of few-shot examples. Default: 0
             train_split: str, usually for few-shot examples. e.g. 'train'
             eval_split: str, the target eval split name. e.g. 'test'
             prompt_template: str, the prompt template for the benchmark,
                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
                     the form of A or B or C or D, do not output explanation:`
-        """
+            system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
+            query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
+            pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
+            description: str, the description of the benchmark,
+                e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
+        """ # noqa: E501
         self.name = name
         self.dataset_id = dataset_id
         self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
+        self.llm_as_a_judge = llm_as_a_judge
+        self.output_types = output_types or [model_adapter]
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
@@ -62,9 +75,8 @@
         self.system_prompt = system_prompt
         self.query_template = query_template
         self.pretty_name = pretty_name
+        self.description = description
         self.config_kwargs = kwargs
-        self.output_types = output_types or [model_adapter]
-        self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
 
@@ -156,6 +168,49 @@
         """
         return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
+    def load_with_snapshot(self,
+                           file_structure: Dict[str, List[str]],
+                           dataset_name_or_path: str = None,
+                           subset_list: list = None,
+                           work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                           **kwargs) -> dict:
+        """
+        For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
+        This feature supports both remote and local datasets.
+
+        Args:
+            file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
+            dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
+            subset_list: list of subset names for the dataset.
+            work_dir: str, the working directory to store the dataset.
+        Returns: {'subset_name': {'eval': eval_dataset}}
+        """ # noqa: E501
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
+        subset_list = subset_list or self.subset_list
+
+        # Try to load dataset from local disk
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+
+            # Load dataset from remote
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # flatten file structure
+            file_names = [file for sub_files in file_structure.values() for file in sub_files]
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
+        # read and process files
+        data_dict = defaultdict(dict)
+        for sub_name in subset_list:
+            file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
+            # not train split, only eval split
+            data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)
+
+        return data_dict
+
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
         Reformat the dataset subset with subset_key and format.
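The new load_with_snapshot path is exercised by the FRAMES adapter later in this diff; the intended pattern is for a benchmark to override load() and declare which files make up each subset. A minimal sketch (the subset and file names are illustrative):

# In a DataAdapter subclass: route loading through the snapshot helper above.
def load(self, **kwargs):
    kwargs['file_structure'] = {'default': ['test.jsonl']}  # subset name -> files to download/read
    return super().load_with_snapshot(**kwargs)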
@@ -249,7 +304,7 @@
     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
                             **kwargs) -> Dict[str, List[float]]:
         """
-        compute weighted mean of
+        compute weighted mean of score of all samples
 
         Args:
             review_res_list: [score1, score2, ...]
@@ -270,7 +325,7 @@
             items['AverageAccuracy'].append(scores)
         return items
 
-    def gen_report(self, subset_score_map: dict,
+    def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
 
@@ -278,7 +333,7 @@
             subset_score_map: The subset-score map.
                 e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
 
-
+            model_name: The evaluation model name.
 
         Returns: The evaluation report.
 
@@ -312,9 +367,17 @@
                 "model_name": "qwen2.5"
             }
         """ # noqa: E501
-
-
-
+        return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
+
+    def post_process_report(self, report: Report, **kwargs):
+        """
+        Post-process the report after generation. Draw a chart, save to file, etc.
+        This method can be overridden to customize the report format or content.
+
+        Args:
+            report (Report): The generated report.
+        """
+        pass
 
     def gen_prompt_data(self,
                         prompt: str,
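With this change gen_report has a concrete default that delegates to ReportGenerator, and post_process_report is an optional hook (a no-op unless a benchmark overrides it). A rough sketch of the flow, reusing the subset_score_map example from the docstring above and assuming `adapter` is a DataAdapter instance:

# Illustrative only: the subset name 'default' and model name mirror the docstring example.
subset_score_map = {'default': [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
report = adapter.gen_report(subset_score_map, model_name='qwen2.5')
adapter.post_process_report(report)  # override this hook to draw charts, save files, etc.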
@@ -324,6 +387,23 @@
                         id: Optional[Union[int, str]] = None,
                         messages: Optional[List[dict]] = None,
                         **kwargs) -> dict:
+        """
+        Generates a dictionary representation of prompt data for evaluation or inference.
+
+        Args:
+            prompt (str): The main prompt or input text. Can also be a list of prompts.
+            system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
+            choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
+                If not provided, uses self.choices. Defaults to None.
+            index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
+                Defaults to 0 if not provided. Defaults to None.
+            id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
+            messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
+                If messages is provided, it will be used as the prompt data instead of the prompt string.
+
+        Returns:
+            dict: A dictionary representation of the prompt data, suitable for further processing or model input.
+        """ # noqa: E501
         data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
             data=data,
@@ -416,7 +496,8 @@
 
         # Extract question from raw_input if available
         raw_input = kwargs.get('raw_input', {})
-        question_keys = ['question', 'prompt', 'query', 'problem']
+        question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
+        # Find the first non-empty question key in raw_input
         question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
 
         # Request judge and obtain score
File without changes

evalscope/benchmarks/docmath/docmath_adapter.py
@@ -0,0 +1,84 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import LLMJudge
+
+TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+{question}
+
+Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+@Benchmark.register(
+    name='docmath',
+    pretty_name='DocMath',
+    description=
+    'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.', # noqa: E501
+    dataset_id='yale-nlp/DocMath-Eval',
+    metric_list=['AverageAccuracy'],
+    subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=TEMPLATE_0SHOT,
+)
+class DocMathAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs):
+        # default load mini test
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        context = context = '\n'.join(input_d['paragraphs'])
+        question = input_d['question']
+        prompt = self.prompt_template.format(context=context, question=question)
+        return self.gen_prompt_data(prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['ground_truth']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        from .utils import extract_answer
+
+        extracted_answer = extract_answer(result)
+        return extracted_answer
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import get_acc
+
+        return get_acc(prediction=pred, gt=gold)
+
+    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+        # parse grading response
+        if 'YES' in orm_response:
+            return 1.0
+        else:
+            return 0.0
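Assuming the usual evalscope entry points (TaskConfig and run_task, unchanged in this release), the new benchmark should be selectable by its registered name; the model id and limit below are placeholders:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model id
    datasets=['docmath'],         # name registered by @Benchmark.register above
    limit=5,                      # small smoke-test limit
)
run_task(task_cfg=task_cfg)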

evalscope/benchmarks/docmath/utils.py
@@ -0,0 +1,220 @@
+import math
+import numpy as np
+import re
+from sympy import Rational
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+Your output must follow the following format:
+1) Provide an explanation for why the answers are equivalent or not.
+2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+""" # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Problem: {problem}
+Answer 1: {answer_1}
+Answer 2: {answer_2}
+"""
+
+
+def round_up_to_decimal(number, decimals):
+    factor = 10**decimals
+    return math.ceil(number * factor) / factor
+
+
+def is_number(string):
+    pattern = r'^[-+]?(\d{1,3}(,\d{3})*|(\d+))(\.\d+)?$'
+    match = re.match(pattern, string)
+    return bool(match)
+
+
+def is_scientific_number(string):
+    pattern = r'^[-+]?\d+(\.\d+)?e[-]?\d+$'
+    match = re.match(pattern, string)
+    return bool(match)
+
+
+def normalize(prediction: str):
+    # Preprocessing the string [Stage 1]
+    prediction = prediction.strip()
+    prediction = prediction.rstrip('.')
+    if not isinstance(prediction, str):
+        prediction = str(prediction) if prediction is not None else '0'
+
+    for money in ['£', '€', '¥', 'million', 'billion', 'thousand', 'US', 'USD', 'RMB']:
+        prediction = prediction.replace(money, '')
+
+    # Replace special tokens
+    if '=' in prediction:
+        prediction = prediction.split('=')[-1].strip()
+    if '≈' in prediction:
+        prediction = prediction.split('≈')[-1].strip()
+    if '`' in prediction:
+        prediction = prediction.replace('`', '')
+    if '%' in prediction:
+        prediction = prediction.replace('%', '')
+    if '$' in prediction:
+        prediction = prediction.replace('$', '')
+    if '°' in prediction:
+        prediction = prediction.replace('°', '')
+
+    # Detect the boolean keyword in the generation
+    if prediction in ['true', 'yes', 'false', 'no']:
+        if prediction == 'true' or prediction == 'yes':
+            prediction = 'True'
+        else:
+            prediction = 'False'
+    if 'True' in prediction or 'False' in prediction:
+        prediction = 'True' if 'True' in prediction else 'False'
+
+    # Detect the approximation keyword
+    if 'approximately' in prediction:
+        prediction = prediction.replace('approximately', '').strip()
+    if ' or ' in prediction:
+        prediction = prediction.split(' or ')[0]
+
+    # Drop the units before and after the number
+    if re.match(r'[-+]?(?:[\d,]*\.*\d+) [^0-9 ]+$', prediction):
+        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+)) [^0-9 ]+$', prediction).group(1)
+    if re.match(r'[^0-9 ]+ [-+]?(?:[\d,]*\.*\d+)$', prediction):
+        prediction = re.search(r'[^0-9 ]+ ([-+]?(?:[\d,]*\.*\d+))$', prediction).group(1)
+    if re.match(r'[-+]?(?:[\d,]*\.*\d+)[^\d]{1,2}$', prediction):
+        prediction = re.search(r'([-+]?(?:[\d,]*\.*\d+))[^\d]{1,2}$', prediction).group(1)
+    if re.match(r'[^-+\d]{1,2}(?:[\d,]*\.*\d+)$', prediction):
+        prediction = re.search(r'[^-+\d]{1,2}((?:[\d,]*\.*\d+))$', prediction).group(1)
+
+    # Preprocessing the number [Stage 1]
+    if '10^' in prediction:
+        prediction = re.sub(r'10\^(-?\d+)', r'math.pow(10, \1)', prediction)
+    if ' x ' in prediction:
+        prediction = prediction.replace(' x ', '*')
+    if ' × ' in prediction:
+        prediction = prediction.replace(' × ', '*')
+    if is_number(prediction):
+        prediction = prediction.replace(',', '')
+
+    # Preprocessing the option [Stage 3]
+    if '(a)' in prediction or '(b)' in prediction or '(c)' in prediction or '(d)' in prediction:
+        prediction = '"' + re.search(r'\([a-d]\)', prediction).group(0) + '"'
+
+    # If the prediction is empty, use dummy '0'
+    if not prediction:
+        prediction = '0'
+
+    # Converting the string answer to a number/list/bool/option
+    try:
+        prediction = eval(prediction)
+    except Exception:
+        # TO CHECK
+        prediction = 0
+
+    # Performing common type conversion
+    if isinstance(prediction, (set, tuple)):
+        prediction = list(prediction)
+        if isinstance(prediction[0], complex):
+            prediction = [tmp.real for tmp in prediction]
+        elif isinstance(prediction[0], Rational):
+            prediction = [float(tmp) for tmp in prediction]
+    elif isinstance(prediction, np.ndarray):
+        prediction = prediction.tolist()
+    else:
+        if isinstance(prediction, complex):
+            prediction = prediction.real
+        elif isinstance(prediction, Rational):
+            prediction = float(prediction)
+
+    return prediction
+
+
+def extract_answer(response: str):
+    """Parses the final answer from the model's response text.
+
+    Args:
+        response: Text extracted from the model's response
+
+    Returns:
+        The final answer as a numeric value (string), or None if not found
+    """
+    # Remove any asterisks or other unwanted characters
+    response = response.replace('*', '')
+    response = response.replace('(', '')
+    response = response.replace(')', '')
+
+    # Search for the pattern 'the answer is {final answer}.'
+    match = re.search(r'the answer is (\=?\≈?\`?\%?\$?\°?\£?\€?\¥?-?[0-9\.,]+)', response, re.IGNORECASE)
+
+    if match:
+        # Remove commas from the matched number (if any)
+        res = match.group(1).replace(',', '').rstrip('.')
+        return res
+    else:
+        return response
+
+
+def within_eps(pred: float, gt: float):
+    eps = abs(gt) * 0.0015
+    if pred >= gt - eps and pred <= gt + eps:
+        return True
+    else:
+        return False
+
+
+def compare_two_numbers(p, gt):
+    if isinstance(p, int) or isinstance(p, float):
+        pass
+    elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
+        return False
+    elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
+        return False
+    else:
+        raise ValueError(p)
+
+    v1, v2 = max(abs(gt), abs(p)), min(abs(gt), abs(p))
+    if (v1 != 0 and v2 != 0) and int(math.log10(v1 / v2)) == math.log10(v1 / v2):
+        return True
+
+    if v2 <= v1 / 50 and within_eps(pred=v2 * 100, gt=v1):
+        return True
+    elif v2 <= v1 / 500 and within_eps(pred=v2 * 1000, gt=v1):
+        return True
+    elif v2 <= v1 / 50000 and within_eps(pred=v2 * 100000, gt=v1):
+        return True
+
+    if round_up_to_decimal(v1, 2) == round_up_to_decimal(v2, 2):
+        return True
+
+    return within_eps(pred=p, gt=gt)
+
+
+def get_acc(prediction, gt, cot=True):
+    try:
+        if cot:
+            prediction = normalize(prediction)
+        else:
+            prediction = float(prediction)
+
+        answer_type = type(gt).__name__
+        assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
+        if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
+            # Comparing prediction against the reference
+            if answer_type in ['bool']:
+                acc = int(prediction == gt)
+            elif answer_type == 'int':
+                acc = int(compare_two_numbers(prediction, gt))
+            elif answer_type == 'float' or answer_type == 'float64':
+                acc = int(compare_two_numbers(prediction, gt))
+            else:
+                acc = 0
+        else:
+            acc = 0
+            logger.error('Error: ', prediction, type(prediction))
+        return acc
+    except Exception:
+        return 0
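For reference, the scoring helpers above behave roughly like this on a typical DocMath-style response (worked by hand from the code, not taken from the package's tests):

from evalscope.benchmarks.docmath.utils import extract_answer, get_acc

pred = extract_answer('Therefore, the answer is 1,234.5.')  # -> '1234.5'
acc = get_acc(prediction=pred, gt=1234.5)                   # -> 1: normalize() parses the string and the values match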
File without changes

evalscope/benchmarks/frames/frames_adapter.py
@@ -0,0 +1,90 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import LLMJudge, exact_match
+
+TEMPLATE_0SHOT = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
+{question}
+
+Format your response as follows: "Therefore, the answer is (insert answer here)"."""
+
+
+@Benchmark.register(
+    name='frames',
+    pretty_name='FRAMES',
+    description=
+    'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.', # noqa: E501
+    dataset_id='iic/frames',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.GENERATION],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=TEMPLATE_0SHOT,
+)
+class FramesAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load(self, **kwargs):
+        # default load with snapshot
+        kwargs['file_structure'] = {'default': ['test.jsonl']}
+        data_dict = super().load_with_snapshot(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
+        question = input_d['Prompt']
+        prompt = self.prompt_template.format(context=context, question=question)
+        return self.gen_prompt_data(prompt)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['Answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        response = result.replace('*', '')
+
+        if 'the answer is' in response:
+            ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
+        else:
+            ans = ''
+
+        return ans
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import normalize_answer
+        gold = normalize_answer(gold)
+        pred = normalize_answer(pred)
+        return exact_match(gold=gold, pred=pred)
+
+    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+        from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['Prompt']
+        # get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
+        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+        # parse grading response
+        if 'YES' in orm_response:
+            return 1.0
+        else:
+            return 0.0

evalscope/benchmarks/frames/utils.py
@@ -0,0 +1,37 @@
+import re
+import string
+
+
+def normalize_answer(s):
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+GENERAL_ORM_PROMPT = """You are an expert in verifying if two answers are the same.
+Your input is a problem and two answers, Answer 1 and Answer 2. You need to check if they are equivalent.
+Your task is to determine if two answers are equivalent, without attempting to solve the original problem.
+Compare the answers to verify they represent identical values or meaning, even when written in different forms or notations.
+
+Your output must follow the following format:
+1) Provide an explanation for why the answers are equivalent or not.
+2) Then provide your final answer in the form of: [[YES]] or [[NO]]
+""" # noqa: E501
+
+ORM_USER_TEMPLATE = """
+Problem: {problem}
+Answer 1: {answer_1}
+Answer 2: {answer_2}
+"""
File without changes
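normalize_answer strips articles, punctuation, case and extra whitespace before the exact-match comparison in FramesAdapter.match; a small worked example (derived from the code above, not from the package's tests):

from evalscope.benchmarks.frames.utils import normalize_answer
from evalscope.metrics import exact_match

gold = normalize_answer('The Eiffel Tower.')  # -> 'eiffel tower'
pred = normalize_answer('Eiffel tower')       # -> 'eiffel tower'
score = exact_match(gold=gold, pred=pred)     # identical strings, so this scores as a match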