evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/models/custom/dummy_model.py CHANGED
@@ -1,61 +1,99 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import time
 from typing import List

+from evalscope.models import CustomModel
 from evalscope.utils.logger import get_logger
-from .custom_model import CustomModel

 logger = get_logger()
-"""
-This script is used to rewrite the evaluation results without re-running the model predictions.
-"""


 class DummyCustomModel(CustomModel):

-    def __init__(self, config: dict = {
+    def __init__(self, config: dict = {}, **kwargs):
         super(DummyCustomModel, self).__init__(config=config, **kwargs)

-    def
-
+    def make_request_messages(self, input_item: dict) -> list:
+        """
+        Make request messages for OpenAI API.
+        """
+        if input_item.get('messages', None):
+            return input_item['messages']

-
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
+        else:
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)

-
-
-
-            'message': {
-                'content': response,
-                'role': 'assistant'
-            }
-        }],
-        'created': time.time(),
-        'model': self.config.get('model_id'),  # should be model_id
-        'object': 'chat.completion',
-        'usage': {
-            'completion_tokens': 0,
-            'prompt_tokens': 0,
-            'total_tokens': 0
-        }
-    }
+        messages = []
+        if system_prompt:
+            messages.append({'role': 'system', 'content': system_prompt})

-
+        messages.append({'role': 'user', 'content': query})

+        return messages

-
-
-
-
-
-
+    def predict(self, prompts: List[dict], **kwargs):
+        original_inputs = kwargs.get('origin_inputs', None)
+        infer_cfg = kwargs.get('infer_cfg', None)
+
+        logger.debug(f'** Prompts: {prompts}')
+        if original_inputs is not None:
+            logger.debug(f'** Original inputs: {original_inputs}')
+        if infer_cfg is not None:
+            logger.debug(f'** Inference config: {infer_cfg}')
+
+        # Simulate a response based on the prompts
+        # Must return a list of dicts with the same format as the OpenAI API.
+        responses = []
+        for input_item in original_inputs:
+            message = self.make_request_messages(input_item)
+            response = f'Dummy response for prompt: {message}'
+
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.config.get('model_id'),
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }

-
+            responses.append(res_d)

-
+        return responses

-    task_cfg_d = yaml_to_dict(task_cfg_file)
-    task_cfg_d.update({'model': swift_model})

-
-
+if __name__ == '__main__':
+    from evalscope import TaskConfig, run_task
+
+    dummy_model = DummyCustomModel()
+    task_config = TaskConfig(
+        model=dummy_model,
+        model_id='evalscope-model-dummy',
+        datasets=['gsm8k'],
+        eval_type='custom',  # must be custom for custom model evaluation
+        generation_config={
+            'max_new_tokens': 100,
+            'temperature': 0.0,
+            'top_p': 1.0,
+            'top_k': 50,
+            'repetition_penalty': 1.0
+        },
+        debug=True,
+        limit=5,
+    )
+
+    eval_results = run_task(task_cfg=task_config)
evalscope/models/local_model.py CHANGED
@@ -82,7 +82,7 @@ class LocalImageModel(LocalModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+        self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
         # default to DiffusionPipeline if not specified
         if self.pipeline_cls is None:
             if 'flux' in self.model_id.lower():
evalscope/models/register.py CHANGED
@@ -47,8 +47,9 @@ def register_model_adapter_class(cls, name=None):
 # register all model adapters
 register_model_adapter_class(BaseModelAdapter, name='base')
 register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
-register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.
+register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
 register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
 register_model_adapter_class(CustomModelAdapter, name='custom')
 register_model_adapter_class(ServerModelAdapter, name='server')
+register_model_adapter_class(BFCLAdapter, name='bfcl_server')
 register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
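For context, a minimal sketch of how an additional adapter could be hooked into the same registry as the new `bfcl_server` entry, using the `register_model_adapter_class(cls, name=None)` helper shown in the hunk above. The subclass and the import locations below are assumptions for illustration, not part of this release.

```python
# Hypothetical illustration only: the adapter class and import paths are assumed.
# Registration simply associates a class with a lookup name in the adapter registry.
from evalscope.models.register import register_model_adapter_class
from evalscope.models.adapters import BaseModelAdapter  # assumed import location


class MyServerAdapter(BaseModelAdapter):  # hypothetical adapter
    """Placeholder adapter; a real one would implement the predict interface."""


register_model_adapter_class(MyServerAdapter, name='my_server')
```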
evalscope/perf/arguments.py CHANGED
@@ -55,6 +55,7 @@ class Arguments:

     # Response settings
     frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+    repetition_penalty: Optional[float] = None  # Repetition penalty for the response
     logprobs: Optional[bool] = None  # Whether to log probabilities
     max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
@@ -181,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Response settings
     parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+    parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
     parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
     parser.add_argument(
         '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
evalscope/perf/benchmark.py CHANGED
@@ -41,14 +41,27 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     message_generator_class = DatasetRegistry(args.dataset)
     message_generator = message_generator_class(args)

+    dataset_messages = []
+    try:
+        for messages in message_generator:
+            dataset_messages.append(messages)
+    except StopIteration:
+        pass
+
+    if not dataset_messages:
+        raise Exception('Dataset is empty!')
+
     count = 0
-
+    dataset_index = 0
+
+    while count < args.number:
+        messages = dataset_messages[dataset_index]
         request = query_generator.build_request(messages, args)
         if request is not None:
             yield request
             count += 1
-
-
+
+        dataset_index = (dataset_index + 1) % len(dataset_messages)

     if args.prompt:
         prompt = load_prompt(args.prompt)

evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -75,6 +75,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
+        if param.repetition_penalty is not None:
+            payload['repetition_penalty'] = param.repetition_penalty
         if param.logprobs is not None:
             payload['logprobs'] = param.logprobs
         if param.n_choices is not None:
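Taken together, the two hunks above add a `repetition_penalty` knob to the perf arguments/CLI and forward it to the request body only when it is set. A standalone sketch of that flow (plain argparse, with a hypothetical payload) for reference:

```python
# Standalone sketch mirroring the wiring above: the flag defaults to None and is
# only copied into the request payload when the user actually provides it.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--frequency-penalty', type=float, default=None)
parser.add_argument('--repetition-penalty', type=float, default=None)
args = parser.parse_args(['--repetition-penalty', '1.05'])

payload = {'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hi'}]}  # hypothetical
if args.frequency_penalty is not None:
    payload['frequency_penalty'] = args.frequency_penalty
if args.repetition_penalty is not None:
    payload['repetition_penalty'] = args.repetition_penalty

print(payload)  # includes 'repetition_penalty': 1.05
```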
evalscope/report/combinator.py CHANGED
@@ -34,25 +34,51 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:

 def get_data_frame(report_list: List[Report],
                    flatten_metrics: bool = True,
-                   flatten_categories: bool = True
+                   flatten_categories: bool = True,
+                   add_overall_metric: bool = False) -> pd.DataFrame:
     tables = []
     for report in report_list:
-        df = report.to_dataframe(
+        df = report.to_dataframe(
+            flatten_metrics=flatten_metrics,
+            flatten_categories=flatten_categories,
+            add_overall_metric=add_overall_metric)
         tables.append(df)
     return pd.concat(tables, ignore_index=True)


-def gen_table(reports_path_list: list
-
-
-
-
-
-def gen_report_table(report: Report) -> str:
+def gen_table(reports_path_list: list[str] = None,
+              report_list: list[Report] = None,
+              flatten_metrics: bool = True,
+              flatten_categories: bool = True,
+              add_overall_metric: bool = False) -> str:
     """
-
+    Generates a formatted table from a list of report paths or Report objects.
+
+    Args:
+        reports_path_list (list[str], optional): List of file paths to report files.
+            Either this or `report_list` must be provided.
+        report_list (list[Report], optional): List of Report objects.
+            Either this or `reports_path_list` must be provided.
+        flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+        flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+        add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+    Returns:
+        str: A string representation of the table in grid format.
+
+    Raises:
+        AssertionError: If neither `reports_path_list` nor `report_list` is provided.
     """
-
+    assert (reports_path_list is not None) or (report_list is not None), \
+        'Either reports_path_list or report_list must be provided.'
+    if report_list is None:
+        report_list = get_report_list(reports_path_list)
+    # Generate a DataFrame from the report list
+    table = get_data_frame(
+        report_list,
+        flatten_metrics=flatten_metrics,
+        flatten_categories=flatten_categories,
+        add_overall_metric=add_overall_metric)
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


@@ -68,7 +94,7 @@ if __name__ == '__main__':
     report_dir_1 = './outputs/20250117_151926'
     # report_dir_2 = './outputs/20250107_204445/reports'

-    report_table = gen_table([report_dir_1])
+    report_table = gen_table(reports_path_list=[report_dir_1])
     print(report_table)

     # ALL VALUES ONLY FOR EXAMPLE
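A short usage sketch of the reworked `gen_table`, which now takes keyword arguments and accepts either report paths or already-loaded `Report` objects (the output directory below is the placeholder from the example above):

```python
# Usage sketch; the reports directory is a placeholder.
from evalscope.report.combinator import gen_table, get_report_list

# From saved report files on disk:
print(gen_table(reports_path_list=['./outputs/20250117_151926'], add_overall_metric=True))

# Or from Report objects that are already in memory:
reports = get_report_list(['./outputs/20250117_151926'])
print(gen_table(report_list=reports))
```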
evalscope/report/utils.py CHANGED
@@ -133,11 +133,25 @@ class Report:
             data = json.load(f)
         return cls.from_dict(data)

-    def to_dataframe(self,
+    def to_dataframe(self,
+                     flatten_metrics: bool = True,
+                     flatten_categories: bool = True,
+                     add_overall_metric: bool = False) -> pd.DataFrame:
+        """
+        Convert the report to a pandas DataFrame.
+        Args:
+            flatten_metrics (bool): Whether to flatten the metrics to a single row.
+            flatten_categories (bool): Whether to flatten the categories to multiple rows.
+            add_overall_metric (bool): Whether to add an overall metric row.
+        Returns:
+            pd.DataFrame: The report as a pandas DataFrame.
+        """
         table = defaultdict(list)
         for metric in self.metrics:
+            metric_count = 0
             for category in metric.categories:
                 for subset in category.subsets:
+                    metric_count += 1
                     table[ReportKey.model_name].append(self.model_name)
                     table[ReportKey.dataset_name].append(self.dataset_name)
                     table[ReportKey.metric_name].append(metric.name)
@@ -145,6 +159,15 @@ class Report:
                     table[ReportKey.subset_name].append(subset.name)
                     table[ReportKey.num].append(subset.num)
                     table[ReportKey.score].append(subset.score)
+            # add overall metric when there are multiple subsets
+            if metric_count > 1 and add_overall_metric:
+                table[ReportKey.model_name].append(self.model_name)
+                table[ReportKey.dataset_name].append(self.dataset_name)
+                table[ReportKey.metric_name].append(metric.name)
+                table[ReportKey.category_name].append(('-', ))
+                table[ReportKey.subset_name].append('OVERALL')
+                table[ReportKey.num].append(metric.num)
+                table[ReportKey.score].append(metric.score)
             # NOTE: only flatten metrics if needed, use the first metric by default
             if not flatten_metrics:
                 break
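Correspondingly, a brief sketch of `Report.to_dataframe` with the new flag, which appends an `OVERALL` row for any metric that spans more than one subset (directory path again a placeholder):

```python
# Sketch only: loads a saved report and inspects the per-subset rows plus the
# OVERALL row added when add_overall_metric=True and a metric has multiple subsets.
from evalscope.report.combinator import get_report_list

report = get_report_list(['./outputs/20250117_151926'])[0]
df = report.to_dataframe(flatten_categories=True, add_overall_metric=True)
print(df)
```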
evalscope/run.py CHANGED
@@ -133,7 +133,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:

     # Make overall report
     try:
-        report_table: str = gen_table([outputs.reports_dir])
+        report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
evalscope/summarizer.py CHANGED
@@ -30,7 +30,7 @@ class Summarizer:
            with open(report_file, 'r') as f:
                res_list.append(json.load(f))

-        report_table: str = gen_table([reports_dir])
+        report_table: str = gen_table(reports_path_list=[reports_dir])
         logger.info(f'*** Report table ***\n{report_table}')

         return res_list
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+import csv
 import json
 import jsonlines as jsonl
 import os
@@ -112,8 +113,58 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
         writer.write_all(data_list)


-def jsonl_to_csv():
-
+def jsonl_to_csv(jsonl_file, csv_file):
+    """
+    Convert jsonl file to csv file.
+
+    Args:
+        jsonl_file: jsonl file path.
+        csv_file: csv file path.
+    """
+    data = jsonl_to_list(jsonl_file)
+    if not data:
+        logger.warning(f'No data found in {jsonl_file}.')
+        return
+
+    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(data[0].keys())  # Write header
+        for item in data:
+            writer.writerow(item.values())
+
+
+def csv_to_list(csv_file) -> list:
+    """
+    Read csv file to list.
+
+    Args:
+        csv_file: csv file path.
+
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(csv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            res_list.append(row)
+    return res_list
+
+
+def csv_to_jsonl(csv_file, jsonl_file):
+    """
+    Convert csv file to jsonl file.
+
+    Args:
+        csv_file: csv file path.
+        jsonl_file: jsonl file path.
+    """
+    data = csv_to_list(csv_file)
+    if not data:
+        logger.warning(f'No data found in {csv_file}.')
+        return
+
+    dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)


 def yaml_to_dict(yaml_file) -> dict:
@@ -168,3 +219,9 @@ def dict_to_json(d: dict, json_file: str):
     """
     with open(json_file, 'w') as f:
         json.dump(d, f, indent=4, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    csv_file = 'custom_eval/text/mcq/example_val.csv'
+    jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'
+    csv_to_jsonl(csv_file, jsonl_file)
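A round-trip sketch of the new CSV helpers (file paths are the examples from the `__main__` block above, plus a hypothetical output name); `jsonl_to_csv` uses the keys of the first record as the header row:

```python
# Round-trip sketch: CSV -> JSONL -> CSV. Paths are illustrative only.
from evalscope.utils.io_utils import csv_to_jsonl, jsonl_to_csv

csv_to_jsonl('custom_eval/text/mcq/example_val.csv', 'custom_eval/text/mcq/example_val.jsonl')
jsonl_to_csv('custom_eval/text/mcq/example_val.jsonl', 'custom_eval/text/mcq/example_val_roundtrip.csv')
```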
{evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.16.1
+Version: 0.16.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -40,7 +40,6 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
-Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
@@ -50,6 +49,7 @@ Requires-Dist: iopath; extra == "aigc"
 Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
+Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: datasets>=3.0; extra == "all"
@@ -75,7 +75,6 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
-Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
@@ -102,6 +101,7 @@ Requires-Dist: iopath; extra == "all"
 Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -230,6 +230,7 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
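For the News entries above, a minimal sketch (model id and limit are placeholders) of pointing an evaluation at the `needle_haystack` benchmark mentioned there; heatmaps are written under `outputs/reports`:

```python
# Minimal sketch with placeholder values; see the linked documentation for full options.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['needle_haystack'],
    limit=5,  # keep the smoke test small
)
run_task(task_cfg=task)
```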