evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/metrics.py
@@ -1,18 +1,15 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI. and its affiliates.
  # Copyright (c) OpenAI. and its affiliates.
+
  import itertools
- import jieba
  import math
  import numpy as np
  import random
  import sacrebleu
- import sklearn.metrics
  from collections import defaultdict
  from collections.abc import Iterable
- from nltk import word_tokenize
- from nltk.translate.bleu_score import sentence_bleu
- from typing import Dict, List, Union
+ from typing import TYPE_CHECKING, Dict, List, Union


  def mean(arr):
@@ -38,6 +35,8 @@ def median(arr):


  def matthews_corrcoef(items):
+     import sklearn.metrics
+
      unzipped_list = list(zip(*items))
      golds = unzipped_list[0]
      preds = unzipped_list[1]
@@ -45,6 +44,8 @@ def matthews_corrcoef(items):


  def f1_score(items):
+     import sklearn.metrics
+
      unzipped_list = list(zip(*items))
      golds = unzipped_list[0]
      preds = unzipped_list[1]
@@ -103,12 +104,20 @@ def perplexity(items):
      return math.exp(-mean(items))


- def weighted_mean(items) -> float:
+ def weighted_mean(items: List) -> float:
      # e.g. [(0,1), (0.5,1), (1,1)]
      a, b = zip(*items)
      return sum(a) / sum(b)


+ def micro_mean(items):
+     return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
+
+
+ def macro_mean(items):
+     return sum([item.score for item in items]) / len(items)
+
+
  def weighted_perplexity(items):
      return math.exp(-weighted_mean(items))

@@ -150,6 +159,9 @@ def bleu_ngram_one_sample(predict, reference):
      }

      """
+     import jieba
+     from nltk import word_tokenize
+     from nltk.translate.bleu_score import sentence_bleu

      def is_contains_chinese(strs):
          for _char in strs:
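Note: the new micro_mean and macro_mean helpers aggregate per-subset report entries that expose score and num attributes. A minimal sketch of the difference (the Subset container below is illustrative, not part of the package):

from dataclasses import dataclass

from evalscope.metrics.metrics import macro_mean, micro_mean


@dataclass
class Subset:            # hypothetical stand-in for a per-subset result entry
    score: float         # average score on the subset
    num: int             # number of samples in the subset


items = [Subset(score=0.90, num=100), Subset(score=0.50, num=10)]
print(micro_mean(items))   # ~0.864: weighted by sample count
print(macro_mean(items))   # 0.70: every subset counts equally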
evalscope/metrics/named_metrics.py (new file)
@@ -0,0 +1,17 @@
+ from dataclasses import dataclass, field
+ from typing import Callable
+
+ from evalscope.metrics.metrics import mean, weighted_mean
+
+
+ @dataclass
+ class Metric:
+     name: str = 'default_metric'
+     object: Callable = field(default_factory=lambda: mean)
+
+
+ AverageAccuracy = Metric(name='AverageAccuracy', object=mean)
+ WeightedAverageAccuracy = Metric(name='WeightedAverageAccuracy', object=weighted_mean)
+ AverageBLEU = Metric(name='AverageBLEU', object=mean)
+ WeightedAverageBLEU = Metric(name='WeightedAverageBLEU', object=weighted_mean)
+ Pass1 = Metric(name='Pass@1', object=mean)
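Note: a Metric simply pairs a display name with an aggregation callable, so wiring up an extra metric is a one-liner. A sketch (MedianAccuracy is an illustrative name, not shipped in this release):

from evalscope.metrics.metrics import median
from evalscope.metrics.named_metrics import Metric

MedianAccuracy = Metric(name='MedianAccuracy', object=median)
print(MedianAccuracy.object([0.0, 1.0, 1.0]))  # 1.0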
evalscope/metrics/rouge_metric.py
@@ -1,15 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  import jieba
- import logging
  from collections import defaultdict
- from pathlib import Path
  from rouge_chinese import Rouge
  from statistics import mean
  from tqdm import tqdm

  from evalscope.constants import MetricsConstant
  from evalscope.metrics.bundled_rouge_score import rouge_scorer
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()


  class DummyTokenizer:
@@ -18,10 +19,6 @@ class DummyTokenizer:
          return text.split()


- HERE = Path(__file__).absolute().parent
-
- logger = logging.getLogger(__name__)
-
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
  zh_scorer = Rouge()

@@ -58,7 +55,11 @@ def compute_rouge_score_one_sample_zh(predict, reference):
          p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
          r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r

-         score = zh_scorer.get_scores(p, r)[0]
+         try:
+             score = zh_scorer.get_scores(p, r, ignore_empty=True)[0]
+         except Exception as e:
+             logger.warning(f'rouge score error: {p} {r} {e}')
+             continue
          result['rouge-1-r'] = score['rouge-1']['r']
          result['rouge-1-p'] = score['rouge-1']['p']
          result['rouge-1-f'] = score['rouge-1']['f']
@@ -75,7 +76,11 @@ def compute_rouge_score_one_sample(predict, reference):
  def compute_rouge_score_one_sample(predict, reference):
      result = dict()
      for p, r in zip(predict, reference):
-         score = scorer.score(p, r)
+         try:
+             score = scorer.score(p, r)
+         except Exception as e:
+             logger.warning(f'rouge score error: {p} {r} {e}')
+             continue
          result['rouge-1-r'] = score['rouge1'].recall
          result['rouge-1-p'] = score['rouge1'].precision
          result['rouge-1-f'] = score['rouge1'].fmeasure
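Note: both scoring helpers iterate over paired lists of predictions and references, so a failing pair is now skipped with a warning instead of aborting the whole sample. A rough call, assuming the result keys shown above:

from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample

scores = compute_rouge_score_one_sample(['the cat sat on the mat'], ['a cat sat on the mat'])
print(scores.get('rouge-1-f'))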
evalscope/models/__init__.py
@@ -1,3 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.models.model import BaseModel, ChatBaseModel
+ from evalscope.models.base_adapter import BaseModelAdapter, initialize_model_adapter
+ from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+ from evalscope.models.choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
+ from evalscope.models.custom import CustomModel
+ from evalscope.models.custom_adapter import CustomModelAdapter
+ from evalscope.models.local_model import LocalModel, get_local_model
+ from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
+ from evalscope.models.server_adapter import ServerModelAdapter
+
+ __all__ = [
+     'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
+     'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
+     'LocalModel', 'get_local_model', 'initialize_model_adapter'
+ ]
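Note: the adapter classes are now re-exported at package level, so downstream code can import them directly:

from evalscope.models import (ChatGenerationModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter,
                              initialize_model_adapter)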
evalscope/models/base_adapter.py (new file)
@@ -0,0 +1,52 @@
+ import torch
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING, Any, Optional, Union
+
+ from evalscope.constants import EvalType
+ from evalscope.models.custom import CustomModel
+ from evalscope.models.local_model import LocalModel
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+
+ class BaseModelAdapter(ABC):
+
+     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
+         if model is None:
+             self.model_cfg = kwargs.get('model_cfg', None)
+         elif isinstance(model, LocalModel):
+             self.model = model.model
+             self.model_id = model.model_id
+             self.model_revision = model.model_revision
+             self.device = model.device
+             self.tokenizer = model.tokenizer
+             self.model_cfg = model.model_cfg
+         elif isinstance(model, CustomModel):
+             self.model_cfg = model.config
+         else:
+             raise ValueError(f'Unsupported model type: {type(model)}')
+
+     @abstractmethod
+     @torch.no_grad()
+     def predict(self, *args, **kwargs) -> Any:
+         raise NotImplementedError
+
+
+ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'):
+     """Initialize the model adapter based on the task configuration."""
+     if task_cfg.dry_run:
+         from evalscope.models.model import DummyChatModel
+         return DummyChatModel(model_cfg=dict())
+     elif task_cfg.eval_type == EvalType.CUSTOM:
+         if not isinstance(task_cfg.model, CustomModel):
+             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
+         from evalscope.models import CustomModelAdapter
+         return CustomModelAdapter(custom_model=task_cfg.model)
+     elif task_cfg.eval_type == EvalType.SERVICE:
+         from evalscope.models import ServerModelAdapter
+         return ServerModelAdapter(
+             api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed)
+     else:
+         return model_adapter_cls(
+             model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
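Note: initialize_model_adapter is the single dispatch point: dry_run returns a dummy model, eval_type 'custom' wraps the user's CustomModel, 'service' builds a ServerModelAdapter from the API settings, and anything else instantiates the benchmark's own adapter class on the local model. A rough sketch of the service path, assuming TaskConfig accepts these fields as constructor arguments (the URL and model id are placeholders):

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.models import ChatGenerationModelAdapter, initialize_model_adapter

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',                           # served model id (placeholder)
    eval_type=EvalType.SERVICE,
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder endpoint
    api_key='EMPTY')
adapter = initialize_model_adapter(task_cfg, ChatGenerationModelAdapter, base_model=None)
# adapter is a ServerModelAdapter here; the local-model path would return ChatGenerationModelAdapter instead.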
evalscope/models/chat_adapter.py (new file)
@@ -0,0 +1,138 @@
+ import os
+ import time
+ import torch
+ from typing import Union
+
+ from evalscope.models.base_adapter import BaseModelAdapter
+ from evalscope.models.local_model import LocalModel
+ from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.model_utils import fix_do_sample_warning
+
+ logger = get_logger()
+
+
+ class ChatGenerationModelAdapter(BaseModelAdapter):
+     """
+     Chat generation model adapter.
+     """
+
+     def __init__(self, model: LocalModel, **kwargs):
+         super().__init__(model)
+
+         self.generation_config = self._parse_generation_config(self.tokenizer, self.model)
+
+         custom_generation_config = kwargs.pop('generation_config', None)
+         custom_chat_template = kwargs.pop('chat_template', None)
+
+         if custom_generation_config:
+             logger.info('Updating generation config ...')
+             self.generation_config.update(**custom_generation_config)
+
+         if custom_chat_template:
+             self.tokenizer.chat_template = custom_chat_template
+             logger.info(f'Using custom chat template: {custom_chat_template}')
+
+     def _parse_generation_config(self, tokenizer, model):
+         from modelscope import GenerationConfig
+
+         generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
+
+         try:
+             remote_config = GenerationConfig.from_pretrained(
+                 self.model_id, revision=self.model_revision, trust_remote_code=True)
+             generation_config.update(**remote_config.to_dict())
+         except Exception:
+             logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
+
+         if isinstance(self.model_id, str) and os.path.exists(self.model_id):
+             logger.warning(f'Got local model dir: {self.model_id}')
+
+         if tokenizer.eos_token_id is not None:
+             generation_config.eos_token_id = tokenizer.eos_token_id
+         if tokenizer.pad_token_id is not None:
+             generation_config.pad_token_id = tokenizer.pad_token_id
+         if generation_config.max_new_tokens is None:
+             generation_config.max_new_tokens = 2048
+
+         return generation_config
+
+     def _model_generate(self, query: str, system_prompt: str = None, infer_cfg: dict = {}) -> str:
+         """
+         Args:
+             query: The input query.
+             system_prompt: The system prompt.
+             infer_cfg: The inference configuration.
+         Returns:
+             The prediction result.
+         """
+         # For chat model, use the chat template to format the input
+         if self.tokenizer.chat_template is not None:
+             messages = [ChatMessage(role='user', content=query)]
+             if system_prompt:
+                 messages = [ChatMessage(role='system', content=system_prompt)] + messages
+             formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         else:
+             # For base model, use the query as the input
+             formatted_prompt = query
+
+         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
+         input_ids = inputs['input_ids']
+
+         # Process infer_cfg
+         if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
+             infer_cfg['do_sample'] = True
+
+         # stop settings
+         stop = infer_cfg.get('stop', None)
+         eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
+             if stop else self.tokenizer.eos_token_id
+
+         if eos_token_id is not None:
+             infer_cfg['eos_token_id'] = eos_token_id
+             infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
+
+         self.generation_config.update(**infer_cfg)
+         fix_do_sample_warning(self.generation_config)
+
+         # Run inference
+         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
+
+         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
+         return response
+
+     @torch.no_grad()
+     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
+         """
+         Args:
+             inputs: The input data.
+             infer_cfg: The inference configuration.
+         Returns:
+             The prediction result.
+         """
+
+         # Process inputs
+         if isinstance(inputs, str):
+             query = inputs
+             system_prompt = None
+         elif isinstance(inputs, dict):
+             query = inputs['data'][0]
+             system_prompt = inputs.get('system_prompt', None)
+         elif isinstance(inputs, list):
+             query = '\n'.join(inputs)
+             system_prompt = None
+         else:
+             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+         response = self._model_generate(query, system_prompt, infer_cfg)
+
+         choices_list = [
+             ChatCompletionResponseChoice(
+                 index=0, message=ChatMessage(content=response, role='assistant'), finish_reason='stop')
+         ]
+
+         res_d = ChatCompletionResponse(
+             model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
+             usage=None).model_dump(exclude_unset=True)
+
+         return res_d
1
+ import numpy as np
2
+ import time
3
+ import torch
4
+ from typing import List
5
+
6
+ from evalscope.models.base_adapter import BaseModelAdapter
7
+ from evalscope.models.local_model import LocalModel
8
+ from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
9
+
10
+
11
+ class MultiChoiceModelAdapter(BaseModelAdapter):
12
+ """ The multi-choice model adapter. """
13
+
14
+ _DEFAULT_MAX_LENGTH = 2048
15
+
16
+ def __init__(self, model: LocalModel, **kwargs):
17
+ super().__init__(model)
18
+
19
+ self._max_length = kwargs.get('max_length')
20
+
21
+ @property
22
+ def max_length(self):
23
+ if self._max_length:
24
+ return self._max_length
25
+ seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
26
+ for attr in seqlen_config_attrs:
27
+ if hasattr(self.model.config, attr):
28
+ return getattr(self.model.config, attr)
29
+ if hasattr(self.tokenizer, 'model_max_length'):
30
+ if self.tokenizer.model_max_length == 1000000000000000019884624838656:
31
+ return self._DEFAULT_MAX_LENGTH
32
+ return self.tokenizer.model_max_length
33
+ return self._DEFAULT_MAX_LENGTH
34
+
35
+ @torch.no_grad()
36
+ def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
37
+ """
38
+ Multi-choice model prediction func.
39
+
40
+ Args:
41
+ inputs (dict): The inputs for a doc. Format:
42
+ {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
43
+
44
+ infer_cfg (dict): inference configuration.
45
+
46
+ Returns:
47
+ res (dict): The model prediction results. Format:
48
+ {
49
+ 'choices': [
50
+ {
51
+ 'index': 0,
52
+ 'message': {
53
+ 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs.
54
+ 'role': 'assistant'
55
+ }
56
+ }
57
+ ],
58
+ 'created': 1677664795,
59
+ # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
60
+ 'model': 'gpt-3.5-turbo-0613',
61
+ 'object': 'chat.completion',
62
+ 'usage': {
63
+ 'completion_tokens': 17,
64
+ 'prompt_tokens': 57,
65
+ 'total_tokens': 74
66
+ }
67
+ }
68
+ """
69
+ infer_cfg = infer_cfg or {}
70
+ self.model.generation_config.update(**infer_cfg)
71
+
72
+ input_data = inputs['data']
73
+ multi_choices = inputs['multi_choices']
74
+
75
+ output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
76
+ assert output.shape[0] == 1
77
+ logits = output.flatten()
78
+
79
+ choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
80
+ softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
81
+
82
+ if softval.dtype in {torch.bfloat16, torch.float16}:
83
+ softval = softval.to(dtype=torch.float32)
84
+ probs = softval.detach().cpu().numpy()
85
+ pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D
86
+
87
+ res_d = ChatCompletionResponse(
88
+ model=self.model_id,
89
+ choices=[
90
+ ChatCompletionResponseChoice(
91
+ index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
92
+ ],
93
+ object='chat.completion',
94
+ created=int(time.time()),
95
+ usage=None).model_dump(exclude_unset=True)
96
+
97
+ return res_d
98
+
99
+ @staticmethod
100
+ def _get_logits(tokenizer, model, inputs: List[str]):
101
+ input_ids = tokenizer(inputs, padding=False)['input_ids']
102
+ input_ids = torch.tensor(input_ids, device=model.device)
103
+ tokens = {'input_ids': input_ids}
104
+
105
+ outputs = model(input_ids)['logits']
106
+ logits = outputs[:, -1, :]
107
+ log_probs = torch.nn.functional.softmax(logits, dim=-1)
108
+ return log_probs, {'tokens': tokens}
109
+
110
+
111
+ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
112
+ """
113
+ Continuation-logits model adapter.
114
+ """
115
+
116
+ def __init__(self, model: LocalModel, **kwargs):
117
+ super().__init__(model, **kwargs)
118
+
119
+ @torch.no_grad()
120
+ def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
121
+ """
122
+ Multi-choice model prediction func.
123
+ Args:
124
+ inputs (dict): The inputs for a doc. Format:
125
+ {'data': [(context, continuation), ...]}
126
+ infer_cfg (dict): inference configuration.
127
+ Returns:
128
+ res (dict): The model prediction results. Format:
129
+ {
130
+ 'choices': [
131
+ {
132
+ 'index': 0,
133
+ 'message': {
134
+ 'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs.
135
+ 'role': 'assistant'
136
+ }
137
+ }
138
+ ],
139
+ 'created': 1677664795,
140
+ # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
141
+ 'model': 'gpt-3.5-turbo-0613',
142
+ 'object': 'chat.completion',
143
+ 'usage': {
144
+ 'completion_tokens': 17,
145
+ 'prompt_tokens': 57,
146
+ 'total_tokens': 74
147
+ }
148
+ }
149
+ """
150
+ infer_cfg = infer_cfg or {}
151
+
152
+ pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
153
+
154
+ res_d = ChatCompletionResponse(
155
+ model=self.model_id,
156
+ choices=[{
157
+ 'index': 0,
158
+ 'message': {
159
+ 'content': pred_list,
160
+ 'role': 'assistant'
161
+ }
162
+ }],
163
+ object='chat.completion',
164
+ created=int(time.time()),
165
+ usage=None).model_dump(exclude_unset=True)
166
+
167
+ return res_d
168
+
169
+ def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
170
+ self.model.generation_config.update(**infer_cfg)
171
+ # To predict one doc
172
+ doc_ele_pred = []
173
+ for ctx, continuation in inputs:
174
+
175
+ # ctx_enc shape: [context_tok_len] cont_enc shape: [continuation_tok_len]
176
+ ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
177
+
178
+ inputs_tokens = torch.tensor(
179
+ (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
180
+ dtype=torch.long,
181
+ device=self.model.device).unsqueeze(0)
182
+
183
+ logits = self.model(inputs_tokens)[0]
184
+ logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
185
+
186
+ logits = logits[:, -len(cont_enc):, :]
187
+ cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
188
+ logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
189
+
190
+ choice_score = float(logits.sum())
191
+ doc_ele_pred.append(choice_score)
192
+
193
+ # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
194
+ return doc_ele_pred
195
+
196
+ def _encode_pair(self, context, continuation):
197
+ n_spaces = len(context) - len(context.rstrip())
198
+ if n_spaces > 0:
199
+ continuation = context[-n_spaces:] + continuation
200
+ context = context[:-n_spaces]
201
+
202
+ whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
203
+ whole_enc = torch.tensor(whole_enc, device=self.device)
204
+
205
+ context_enc = self.tokenizer(context, padding=False)['input_ids']
206
+ context_enc = torch.tensor(context_enc, device=self.device)
207
+
208
+ context_enc_len = len(context_enc)
209
+ continuation_enc = whole_enc[context_enc_len:]
210
+
211
+ return context_enc, continuation_enc
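Note: ContinuationLogitsModelAdapter scores each (context, continuation) pair by summing the log-probabilities of the continuation tokens; the benchmark adapter then takes the argmax over the options. Conceptually (adapter is assumed to be an already constructed instance):

question = 'Question: What is 2 + 2?\nAnswer:'
options = [' 3', ' 4', ' 5', ' 6']
res = adapter.predict({'data': [(question, opt) for opt in options]})
scores = res['choices'][0]['message']['content']                  # one summed log-prob per option
print(options[max(range(len(options)), key=scores.__getitem__)])  # highest log-likelihood wins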
evalscope/models/custom_adapter.py (new file)
@@ -0,0 +1,67 @@
+ from typing import Any, Dict, List, Union
+
+ from evalscope.models.base_adapter import BaseModelAdapter
+ from evalscope.models.custom import CustomModel
+
+
+ class CustomModelAdapter(BaseModelAdapter):
+
+     def __init__(self, custom_model: CustomModel, **kwargs):
+         """
+         Custom model adapter.
+
+         Args:
+             custom_model: The custom model instance.
+             **kwargs: Other args.
+         """
+         self.custom_model = custom_model
+         super(CustomModelAdapter, self).__init__(model=custom_model)
+
+     def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+         """
+         Model prediction func.
+
+         Args:
+             inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+                 str: 'xxx'
+                 dict: {'data': [full_prompt]}
+                 list: ['xxx', 'yyy', 'zzz']
+             **kwargs: kwargs
+
+         Returns:
+             res (dict): The model prediction results. Format:
+             {
+                 'choices': [
+                     {
+                         'index': 0,
+                         'message': {
+                             'content': 'xxx',
+                             'role': 'assistant'
+                         }
+                     }
+                 ],
+                 'created': 1677664795,
+                 'model': 'gpt-3.5-turbo-0613',  # should be model_id
+                 'object': 'chat.completion',
+                 'usage': {
+                     'completion_tokens': 17,
+                     'prompt_tokens': 57,
+                     'total_tokens': 74
+                 }
+             }
+         """
+         in_prompts = []
+
+         # Note: here we assume the inputs are all prompts for the benchmark.
+         for input_prompt in inputs:
+             if isinstance(input_prompt, str):
+                 in_prompts.append(input_prompt)
+             elif isinstance(input_prompt, dict):
+                 # TODO: to be supported for continuation list like truthful_qa
+                 in_prompts.append(input_prompt['data'][0])
+             elif isinstance(input_prompt, list):
+                 in_prompts.append('\n'.join(input_prompt))
+             else:
+                 raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
+
+         return self.custom_model.predict(prompts=in_prompts, **kwargs)
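Note: CustomModelAdapter flattens the benchmark inputs into a list of prompt strings and forwards them to CustomModel.predict, so a user-side model only needs that one method and should return OpenAI-style completion dicts. A minimal sketch (EchoModel is illustrative; the CustomModel constructor is assumed to take a config dict, since base_adapter.py reads model.config):

from evalscope.models import CustomModel, CustomModelAdapter


class EchoModel(CustomModel):

    def __init__(self):
        super().__init__(config={'model_id': 'echo-model'})

    def predict(self, prompts, **kwargs):
        # One chat.completion-style dict per prompt; here the prompt is simply echoed back.
        return [{
            'choices': [{'index': 0, 'message': {'content': p, 'role': 'assistant'}}],
            'model': 'echo-model',
            'object': 'chat.completion',
        } for p in prompts]


adapter = CustomModelAdapter(custom_model=EchoModel())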