evalscope 0.15.1__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +10 -0
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +4 -2
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope/benchmarks/tool_bench/utils.py +202 -0
- evalscope/benchmarks/utils.py +3 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/collections/evaluator.py +76 -26
- evalscope/config.py +46 -15
- evalscope/evaluator/evaluator.py +43 -15
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +3 -3
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +15 -19
- evalscope/perf/arguments.py +14 -5
- evalscope/perf/benchmark.py +0 -6
- evalscope/perf/main.py +65 -15
- evalscope/perf/utils/benchmark_util.py +33 -15
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/log_utils.py +1 -1
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/app.py +47 -34
- evalscope/report/utils.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/METADATA +45 -21
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/RECORD +46 -36
- tests/cli/test_all.py +3 -0
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +28 -12
- tests/perf/test_perf.py +23 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
CHANGED

@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
-from typing import List
+from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
@@ -190,21 +190,24 @@ class EvaluatorCollection:
         answer_dict = defaultdict(dict)
         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
+            # Create a set of sample indices for which we have answers
             indices = set()
            for answer in answers_list:
                 index = answer.get(AnswerKeys.INDEX)
                 answer_dict[index] = answer
                 indices.add(index)

-
-            for sample in self.dataset
-
-
+            # Filter dataset to only include samples that don't have answers
+            data = [sample for sample in self.dataset if sample.index not in indices]
+
+            # Initialize name map for the filtered dataset
             data_map = self._init_name_map(data)

             logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
             return answer_dict, data, data_map
-
+        else:
+            # If cache isn't enabled or file doesn't exist, return the full dataset
+            return answer_dict, self.dataset, self.dataset_name_map

     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
@@ -214,13 +217,16 @@ class EvaluatorCollection:
         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

         eval_batch_size = self.task_cfg.eval_batch_size
+        # Process samples and get answers
         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
             if self.task_cfg.eval_type == EvalType.SERVICE:
+                # Create a thread pool for parallel processing
                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                     futures = []
                     for sample in dataset:
                         evaluator = self.evaluators[sample.dataset_name]
                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    # Process completed tasks
                     for future in as_completed(futures):
                         answer_list, samples = future.result()
                         answers[samples[0].index] = answer_list[0]
@@ -244,35 +250,79 @@ class EvaluatorCollection:
             pbar.update(len(batch_ids))
         return answers

-    def get_reviews(self, answers):
+    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+        """
+        Retrieve or generate reviews for given answers.
+
+        Args:
+            answers: Dictionary of answers indexed by sample index.
+
+        Returns:
+            Dictionary of reviews indexed by sample index.
+        """
+        # Set up the review file path
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)

-
-
-
-
-
-
-
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f'Error deleting file {file_path}: {e}')
+        review_history_map = defaultdict(dict)
+
+        # Handle caching logic
+        if os.path.exists(review_file_path):
+            if not self.task_cfg.use_cache:
+                # Clear existing reviews if not using cache
+                self._clear_review_files(review_file_path)
             else:
-
+                # Load existing reviews if using cache
+                self._load_existing_reviews(review_file_path, review_history_map)

-        reviews =
+        reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-
-
+            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                # Use cached review if available
+                review_d = review_history_map[file_name][sample.index]
+            else:
+                # Generate new review
+                evaluator = self.evaluators[sample.dataset_name]
+                review_d = evaluator.get_review(answers[sample.index])
+                # Only save the review if it's not in the cache
+                self._save_review(review_file_path, file_name, review_d)
+
             reviews[sample.index] = review_d
-
-                review_d,
-                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
-                dump_mode=DumpMode.APPEND)
+
         return reviews

+    def _clear_review_files(self, review_file_path: str) -> None:
+        """Clear existing review files."""
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                file_path = os.path.join(review_file_path, filename)
+                try:
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                except Exception as e:
+                    logger.error(f'Error deleting file {file_path}: {e}')
+        else:
+            os.remove(review_file_path)
+
+    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+        """Load existing reviews from files."""
+        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                if '.ipynb_checkpoints' in filename:
+                    continue
+                file_path = os.path.join(review_file_path, filename)
+                with open(file_path, 'r') as f:
+                    review_history = [json.loads(line.strip()) for line in f]
+                review_history_map[filename] = {item['index']: item for item in review_history}
+
+    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+        """Save a single review to file."""
+        file_path = os.path.join(review_file_path, file_name)
+        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
     def get_scores(self, reviews) -> float:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
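The reuse logic added to `_filter_answer` keys cached answers by sample index and only re-runs the samples that are missing from the predictions file. A minimal standalone sketch of that pattern (the `Sample` class and the `'index'` field name are stand-ins for `DatasetEntry` and `AnswerKeys.INDEX` from the diff):

import json
from dataclasses import dataclass

@dataclass
class Sample:            # stand-in for evalscope.collections.sampler.DatasetEntry
    index: int
    prompt: str

def filter_uncached(samples, pred_file_path):
    """Split samples into (cached answers, samples still to run), mirroring _filter_answer."""
    cached = {}
    with open(pred_file_path) as f:
        for line in f:
            answer = json.loads(line)
            cached[answer['index']] = answer    # assumes AnswerKeys.INDEX == 'index'
    remaining = [s for s in samples if s.index not in cached]
    return cached, remaining

# Only `remaining` is sent back through the model; `cached` is merged into the final answers.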
evalscope/config.py
CHANGED

@@ -18,24 +18,14 @@ logger = get_logger()

 cur_path = os.path.dirname(os.path.abspath(__file__))

-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-

 @dataclass
 class TaskConfig:
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
-    model_args:
-    model_task:
+    model_args: Dict = field(default_factory=dict)
+    model_task: str = ModelTask.TEXT_GENERATION

     # Template-related arguments
     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -48,7 +38,7 @@ class TaskConfig:
     dataset_hub: str = HubType.MODELSCOPE

     # Generation configuration arguments
-    generation_config:
+    generation_config: Dict = field(default_factory=dict)

     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
@@ -65,6 +55,7 @@ class TaskConfig:
     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.

     # Debug and runtime mode arguments
+    ignore_errors: bool = False
     debug: bool = False
     dry_run: bool = False
     seed: Optional[int] = 42
@@ -95,6 +86,46 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1

+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_default_generation_config(self):
+        if self.generation_config:
+            return
+        if self.model_task == ModelTask.IMAGE_GENERATION:
+            self.generation_config = {
+                'height': 1024,
+                'width': 1024,
+                'num_inference_steps': 50,
+                'guidance_scale': 9.0,
+            }
+        elif self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.generation_config = {
+                    'max_length': 2048,
+                    'max_new_tokens': 512,
+                    'do_sample': False,
+                    'top_k': 50,
+                    'top_p': 1.0,
+                    'temperature': 1.0,
+                }
+            elif self.eval_type == EvalType.SERVICE:
+                self.generation_config = {
+                    'max_tokens': 2048,
+                    'temperature': 0.0,
+                }
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
     def to_dict(self):
         result = self.__dict__.copy()
         if isinstance(self.model, CustomModel):
@@ -218,9 +249,9 @@ def parse_task_config(task_cfg) -> TaskConfig:
     elif isinstance(task_cfg, str):
         extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
             task_cfg = TaskConfig.from_json(task_cfg)
         else:
             raise ValueError('Args: Unsupported file extension.')
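The hard-coded DEFAULT_GENERATION_CONFIG module constant is replaced by per-task defaults chosen after construction, and an explicitly passed `generation_config` suppresses them. A standalone mirror of the selection rules for illustration (plain strings stand in for the `ModelTask`/`EvalType` constants; this is a sketch of the behaviour, not the library code):

def default_generation_config(model_task: str, eval_type: str) -> dict:
    """Mirror of TaskConfig.__init_default_generation_config, for illustration only."""
    if model_task == 'image_generation':
        return {'height': 1024, 'width': 1024, 'num_inference_steps': 50, 'guidance_scale': 9.0}
    if model_task == 'text_generation':
        if eval_type == 'checkpoint':
            return {'max_length': 2048, 'max_new_tokens': 512, 'do_sample': False,
                    'top_k': 50, 'top_p': 1.0, 'temperature': 1.0}
        if eval_type == 'service':
            return {'max_tokens': 2048, 'temperature': 0.0}
    return {}

# Service evaluations now default to deterministic decoding:
assert default_generation_config('text_generation', 'service') == {'max_tokens': 2048, 'temperature': 0.0}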
evalscope/evaluator/evaluator.py
CHANGED

@@ -97,13 +97,23 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
         answers_list = []
-
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return answers_list
+            else:
+                raise e
+        # process answer
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -197,16 +207,17 @@ class Evaluator(object):
             reviewer_spec = {}

         review_res = deepcopy(answer_d)
-
-
-            review_res[ReviewKeys.REVIEWED] =
+        if AnswerKeys.CHOICES not in review_res:
+            review_res[AnswerKeys.CHOICES] = []
+            review_res[ReviewKeys.REVIEWED] = True
             review_res[ReviewKeys.REVIEW_ID] = None
             review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
             review_res[ReviewKeys.REVIEW_TIME] = time.time()
+            logger.warning(f'No choices found for answer dict: {review_res}')
             return review_res

         rev_choices = []
-        for choice in
+        for choice in review_res[AnswerKeys.CHOICES]:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)
@@ -280,11 +291,20 @@ class Evaluator(object):
         review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

+        # Load existing reviews if using cache
+        existing_reviews = {}
         if self.use_cache and os.path.exists(review_file_path):
-
-
+            with open(review_file_path, 'r') as f:
+                for line in f:
+                    review = json.loads(line.strip())
+                    existing_reviews[review['index']] = review
+            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')

         def process_single_review(answer_d):
+            # Check if review already exists in cache
+            if self.use_cache and answer_d['index'] in existing_reviews:
+                return existing_reviews[answer_d['index']]
+
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
@@ -299,8 +319,9 @@ class Evaluator(object):
             for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
                 review_d = future.result()
                 reviews_list.append(review_d)
-                # Dump reviews
-
+                # Dump new reviews only if not using cache or review is new
+                if not self.use_cache or review_d['index'] not in existing_reviews:
+                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

         return reviews_list

@@ -315,17 +336,24 @@ class Evaluator(object):
         Returns:
             The metric result. Depends on the metric function in data_adapter.
         """
+        # Get max choices
+        choices_lengths = [
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
+        ]
+        if choices_lengths:
+            max_choices = max(choices_lengths)
+        else:
+            max_choices = 0

+        # Get review result
         review_res_list = []
-        max_choices = max(
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue

             if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
             elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -367,7 +395,7 @@ class Evaluator(object):
         os.makedirs(os.path.dirname(report_path), exist_ok=True)

         # Write report
-        with open(report_path, 'w') as f:
+        with open(report_path, 'w', encoding='utf-8') as f:
             f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')

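Together with the new `ignore_errors` field on TaskConfig, `_get_answer` now drops a failing batch instead of aborting the whole evaluation. The control flow reduces to this standalone pattern (`predict_fn` and the prompts are placeholders, not evalscope APIs):

def get_answers(prompts, predict_fn, ignore_errors=False):
    """Drop-or-raise behaviour added to Evaluator._get_answer, in isolation."""
    answers = []
    for prompt in prompts:
        try:
            answers.append(predict_fn(prompt))
        except Exception as err:
            if ignore_errors:
                # One flaky call costs a single prompt instead of the whole subset.
                print(f'dropping prompt due to {err!r}')
                continue
            raise    # previous behaviour: any failure aborts the run
    return answers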
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED

@@ -44,20 +44,25 @@ from evalscope.utils import get_logger

 logger = get_logger()

-
-
-
-
-
-
-
-
-    os.
-    os.
-
-
-
-
+
+def check_nltk_data():
+    """
+    Check if nltk data is available in the system.
+    If not, download the necessary data files.
+    """
+    try:
+        nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+        os.makedirs(nltk_dir, exist_ok=True)
+        punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+        punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+        if not os.path.exists(punkt_path):
+            os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
+            os.system(f'unzip {punkt_path} -d {nltk_dir}')
+        else:
+            logger.debug(f'{punkt_path} already exists, skipping download')
+    except Exception as e:
+        logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')


 class RougeScorer(scoring.BaseScorer):
@@ -83,7 +88,7 @@ class RougeScorer(scoring.BaseScorer):
     """

     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-
+        check_nltk_data()
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
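`check_nltk_data` shells out to `wget` and `unzip`. Where those binaries are unavailable, the same download-and-extract step can be done with the standard library alone; a sketch using the same URL and target directory as the diff (the function name is hypothetical):

import os
import urllib.request
import zipfile

def fetch_punkt_tab(nltk_dir: str) -> None:
    os.makedirs(nltk_dir, exist_ok=True)
    archive = os.path.join(nltk_dir, 'punkt_tab.zip')
    url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
    if not os.path.exists(archive):
        urllib.request.urlretrieve(url, archive)    # download the tokenizer data
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(nltk_dir)                 # equivalent of `unzip -d nltk_dir`

fetch_punkt_tab(os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers'))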
evalscope/metrics/llm_judge.py
CHANGED

@@ -59,13 +59,13 @@ class LLMJudge:
         # Initialize ServerModelAdapter
         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)

-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) ->
+    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
         """
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
         Returns:
-
+            str: The response from the LLM
         """
         input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}

@@ -83,7 +83,7 @@ class LLMJudge:
             return llm_response
         except Exception as e:
             logger.error(f'Error during LLM evaluation: {e}')
-            return
+            return ''

     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
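With `__call__` now typed as returning `str` and falling back to an empty string on error, callers can treat "no verdict" uniformly. A hedged usage sketch (the constructor arguments are illustrative; check `LLMJudge.__init__` for the real ones):

from evalscope.metrics.llm_judge import LLMJudge

# Constructor kwargs below are assumptions for illustration only.
judge = LLMJudge(api_url='http://localhost:8000/v1', model_id='judge-model', api_key='EMPTY')
prompt = judge.build_prompt(pred='Paris', gold='Paris', question='What is the capital of France?')
verdict = judge(prompt)    # '' if the request to the judge model failed
if not verdict:
    print('judge unavailable, leaving this sample unscored')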
evalscope/metrics/rouge_metric.py
CHANGED

@@ -19,10 +19,6 @@ class DummyTokenizer:
         return text.split()


-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
-zh_scorer = Rouge()
-
-
 def is_contains_chinese(strs):
     for _char in strs:
         if '\u4e00' <= _char <= '\u9fa5':
@@ -51,6 +47,7 @@ def compute_rouge_score(predict_l, reference_l):

 def compute_rouge_score_one_sample_zh(predict, reference):
     result = dict()
+    zh_scorer = Rouge()
     for p, r in zip(predict, reference):
         p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
         r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
@@ -60,21 +57,22 @@ def compute_rouge_score_one_sample_zh(predict, reference):
         except Exception as e:
             logger.warning(f'rouge score error: {p} {r} {e}')
             continue
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
+        result['Rouge-1-R'] = score['rouge-1']['r']
+        result['Rouge-1-P'] = score['rouge-1']['p']
+        result['Rouge-1-F'] = score['rouge-1']['f']
+        result['Rouge-2-R'] = score['rouge-2']['r']
+        result['Rouge-2-P'] = score['rouge-2']['p']
+        result['Rouge-2-F'] = score['rouge-2']['f']
+        result['Rouge-L-R'] = score['rouge-l']['r']
+        result['Rouge-L-P'] = score['rouge-l']['p']
+        result['Rouge-L-F'] = score['rouge-l']['f']

     return result


 def compute_rouge_score_one_sample(predict, reference):
     result = dict()
+    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
     for p, r in zip(predict, reference):
         try:
             score = scorer.score(p, r)
|
|
|
1
1
|
import os
|
|
2
2
|
import time
|
|
3
3
|
import torch
|
|
4
|
-
from typing import Any, Dict, List, Tuple, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
5
5
|
|
|
6
6
|
from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
|
|
7
7
|
from evalscope.utils.logger import get_logger
|
|
@@ -58,19 +58,15 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
58
58
|
return generation_config
|
|
59
59
|
|
|
60
60
|
def _model_generate(self,
|
|
61
|
-
|
|
62
|
-
system_prompts: List[str] = None,
|
|
61
|
+
formatted_prompts: List[str],
|
|
63
62
|
infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
|
|
64
63
|
"""
|
|
65
64
|
Args:
|
|
66
|
-
|
|
67
|
-
system_prompts: The system prompts.
|
|
65
|
+
formatted_prompts: The formatted prompts.
|
|
68
66
|
infer_cfg: The inference configuration.
|
|
69
67
|
Returns:
|
|
70
68
|
The prediction results.
|
|
71
69
|
"""
|
|
72
|
-
if system_prompts is None:
|
|
73
|
-
system_prompts = []
|
|
74
70
|
if infer_cfg is None:
|
|
75
71
|
infer_cfg = {}
|
|
76
72
|
|
|
@@ -92,27 +88,6 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
92
88
|
self.generation_config.update(**infer_cfg)
|
|
93
89
|
fix_do_sample_warning(self.generation_config)
|
|
94
90
|
|
|
95
|
-
# For chat model, use the chat template to format the input
|
|
96
|
-
if self.tokenizer.chat_template is not None:
|
|
97
|
-
formatted_prompts = []
|
|
98
|
-
for i, query in enumerate(queries):
|
|
99
|
-
messages = [ChatMessage(role='user', content=query)]
|
|
100
|
-
if i < len(system_prompts) and system_prompts[i]:
|
|
101
|
-
messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
|
|
102
|
-
# whether thinking is needed
|
|
103
|
-
chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
|
|
104
|
-
if chat_template_kwargs is not None:
|
|
105
|
-
prompts = self.tokenizer.apply_chat_template(
|
|
106
|
-
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
|
|
107
|
-
else:
|
|
108
|
-
prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
109
|
-
formatted_prompts.append(prompts)
|
|
110
|
-
else:
|
|
111
|
-
# For base model, use the queries as the input
|
|
112
|
-
formatted_prompts = queries
|
|
113
|
-
|
|
114
|
-
logger.debug(f'formatted_prompts: {formatted_prompts}')
|
|
115
|
-
|
|
116
91
|
# Get input ids
|
|
117
92
|
inputs = self.tokenizer(
|
|
118
93
|
formatted_prompts, return_tensors='pt', padding=True, truncation=True,
|
|
@@ -136,26 +111,68 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
|
|
|
136
111
|
|
|
137
112
|
return responses, input_lengths
|
|
138
113
|
|
|
139
|
-
|
|
140
|
-
def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
|
|
114
|
+
def _prepare_inputs(self, inputs: List[dict], infer_cfg: dict = {}) -> List[str]:
|
|
141
115
|
"""
|
|
116
|
+
Prepare the inputs for the model.
|
|
142
117
|
Args:
|
|
143
118
|
inputs: The input data.
|
|
144
119
|
infer_cfg: The inference configuration.
|
|
145
120
|
Returns:
|
|
146
|
-
The
|
|
121
|
+
The prepared inputs and system prompts.
|
|
147
122
|
"""
|
|
148
|
-
|
|
149
|
-
# Process inputs
|
|
150
123
|
queries = []
|
|
151
124
|
system_prompts = []
|
|
125
|
+
message_list = []
|
|
152
126
|
|
|
153
127
|
for input_item in inputs:
|
|
154
128
|
queries.append(input_item['data'][0])
|
|
155
129
|
system_prompts.append(input_item.get('system_prompt', None))
|
|
130
|
+
if input_item.get('messages', None):
|
|
131
|
+
message_list.append(input_item.get('messages', None))
|
|
132
|
+
|
|
133
|
+
# For non chat model, use the original queries as the input
|
|
134
|
+
if self.tokenizer.chat_template is None:
|
|
135
|
+
return queries
|
|
136
|
+
|
|
137
|
+
# For chat model, use the messages as the input
|
|
138
|
+
# if message_list is None, use the queries as the input
|
|
139
|
+
if len(message_list) == 0:
|
|
140
|
+
for i, query in enumerate(queries):
|
|
141
|
+
messages = [ChatMessage(role='user', content=query)]
|
|
142
|
+
if i < len(system_prompts) and system_prompts[i]:
|
|
143
|
+
messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
|
|
144
|
+
message_list.append(messages)
|
|
145
|
+
|
|
146
|
+
# Format the messages
|
|
147
|
+
formatted_prompts = []
|
|
148
|
+
for messages in message_list:
|
|
149
|
+
# apply chat template
|
|
150
|
+
chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
|
|
151
|
+
if chat_template_kwargs is not None:
|
|
152
|
+
prompts = self.tokenizer.apply_chat_template(
|
|
153
|
+
messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
|
|
154
|
+
else:
|
|
155
|
+
prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
156
|
+
formatted_prompts.append(prompts)
|
|
157
|
+
|
|
158
|
+
logger.debug(f'formatted_prompts: {formatted_prompts}')
|
|
159
|
+
return formatted_prompts
|
|
160
|
+
|
|
161
|
+
@torch.no_grad()
|
|
162
|
+
def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = {}) -> List[dict]:
|
|
163
|
+
"""
|
|
164
|
+
Args:
|
|
165
|
+
inputs: The input data.
|
|
166
|
+
infer_cfg: The inference configuration.
|
|
167
|
+
Returns:
|
|
168
|
+
The prediction results.
|
|
169
|
+
"""
|
|
170
|
+
|
|
171
|
+
# Process inputs
|
|
172
|
+
formatted_prompts = self._prepare_inputs(inputs, infer_cfg)
|
|
156
173
|
|
|
157
174
|
# Run inference
|
|
158
|
-
responses, input_lengths = self._model_generate(
|
|
175
|
+
responses, input_lengths = self._model_generate(formatted_prompts, infer_cfg)
|
|
159
176
|
|
|
160
177
|
# Process outputs
|
|
161
178
|
results = []
|