evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  23. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  24. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  25. evalscope/backend/rag_eval/utils/clip.py +149 -0
  26. evalscope/backend/rag_eval/utils/embedding.py +183 -0
  27. evalscope/backend/rag_eval/utils/llm.py +72 -0
  28. evalscope/backend/rag_eval/utils/tools.py +63 -0
  29. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  30. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  31. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  32. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  33. evalscope/benchmarks/race/samples.jsonl +5 -0
  34. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  35. evalscope/cli/start_perf.py +8 -11
  36. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  37. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  38. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  39. evalscope/metrics/rouge_metric.py +30 -15
  40. evalscope/perf/arguments.py +179 -0
  41. evalscope/perf/benchmark.py +245 -0
  42. evalscope/perf/http_client.py +127 -711
  43. evalscope/perf/main.py +35 -0
  44. evalscope/perf/plugin/__init__.py +2 -0
  45. evalscope/perf/plugin/api/__init__.py +3 -0
  46. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  47. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  48. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  49. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  50. evalscope/perf/plugin/datasets/__init__.py +6 -0
  51. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  52. evalscope/perf/plugin/datasets/custom.py +21 -0
  53. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  54. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  55. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  56. evalscope/perf/plugin/datasets/openqa.py +38 -0
  57. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  58. evalscope/perf/plugin/registry.py +54 -0
  59. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  60. evalscope/perf/utils/benchmark_util.py +135 -0
  61. evalscope/perf/utils/chat_service.py +252 -0
  62. evalscope/perf/utils/db_util.py +200 -0
  63. evalscope/perf/utils/handler.py +46 -0
  64. evalscope/perf/utils/local_server.py +139 -0
  65. evalscope/registry/config/cfg_arena.yaml +77 -0
  66. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  67. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  68. evalscope/registry/config/cfg_single.yaml +78 -0
  69. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  70. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  71. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  72. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  73. evalscope/registry/data/question.jsonl +80 -0
  74. evalscope/third_party/longbench_write/README.md +118 -0
  75. evalscope/third_party/longbench_write/default_task.json +27 -0
  76. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  77. evalscope/third_party/toolbench_static/README.md +118 -0
  78. evalscope/third_party/toolbench_static/config_default.json +15 -0
  79. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  80. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  81. evalscope/utils/logger.py +18 -20
  82. evalscope/utils/utils.py +41 -42
  83. evalscope/version.py +2 -2
  84. evalscope-0.7.0.dist-info/LICENSE +203 -0
  85. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
  86. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
  87. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  88. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  89. tests/cli/__init__.py +1 -0
  90. tests/cli/test_run.py +76 -0
  91. tests/perf/__init__.py +1 -0
  92. tests/perf/test_perf.py +96 -0
  93. tests/rag/__init__.py +0 -0
  94. tests/rag/test_clip_benchmark.py +85 -0
  95. tests/rag/test_mteb.py +136 -0
  96. tests/rag/test_ragas.py +120 -0
  97. tests/swift/__init__.py +1 -0
  98. tests/swift/test_run_swift_eval.py +146 -0
  99. tests/swift/test_run_swift_vlm_eval.py +128 -0
  100. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  101. tests/test_run_all.py +12 -0
  102. tests/vlm/__init__.py +1 -0
  103. tests/vlm/test_vlmeval.py +59 -0
  104. evalscope/perf/_logging.py +0 -32
  105. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  106. evalscope/perf/datasets/openqa.py +0 -22
  107. evalscope/perf/plugin_registry.py +0 -35
  108. evalscope/perf/query_parameters.py +0 -42
  109. evalscope/perf/server_sent_event.py +0 -43
  110. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  111. /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
  112. /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
  113. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  114. {evalscope/preprocess → tests}/__init__.py +0 -0
@@ -0,0 +1,149 @@
+ import os
+ import torch
+ import torch.nn.functional as F
+ from typing import List
+ from PIL import Image
+ from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
+ from transformers import AutoModel, AutoProcessor
+ from langchain_core.embeddings import Embeddings
+
+
+ class VisionModel:
+     @staticmethod
+     def load(**kw):
+         api_base = kw.get("api_base", None)
+         if api_base:
+
+             return VLMAPI(
+                 model_name=kw.get("model_name", ""),
+                 openai_api_base=api_base,
+                 openai_api_key=kw.get("api_key", "EMPTY"),
+                 prompt=kw.get("prompt", None),
+             )
+         else:
+             return CLIPModel(**kw)
+
+
+ class VLMAPI:
+     def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
+         from langchain_openai import ChatOpenAI
+         from langchain_core.prompts import ChatPromptTemplate
+
+         self.model_name = model_name
+         self.model = ChatOpenAI(
+             model_name=model_name,
+             openai_api_base=openai_api_base,
+             openai_api_key=openai_api_key,
+         )
+         self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
+         self.prompt = ChatPromptTemplate.from_messages(
+             [
+                 ("system", prompt if prompt else self.default_prompt),
+                 (
+                     "user",
+                     [
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": "data:image/jpeg;base64,{image_data}"},
+                         }
+                     ]
+                 ),
+             ]
+         )
+         self.chain = self.prompt | self.model
+         self.transform = PIL_to_base64
+
+     def encode_image(self, images):
+         captions = []
+         for image in images:
+             response = self.chain.invoke({"image_data": image})
+             captions.append(response.content)
+         return captions
+
+
+ class CLIPModel(Embeddings):
+     def __init__(
+         self,
+         model_name: str,
+         revision: str = "master",
+         hub="modelscope",
+         device="cpu",
+     ):
+         self.device = device
+         self.model_name = model_name
+         self.revision = revision
+
+         # Download the model if it doesn't exist locally
+         if not os.path.exists(model_name) and hub == "modelscope":
+             model_name = download_model(self.model_name, self.revision)
+
+         # Load the model and processor
+         self.model = AutoModel.from_pretrained(model_name).to(self.device)
+         self.processor = AutoProcessor.from_pretrained(model_name)
+         self.transform = self.processor.image_processor
+         self.tokenizer = self.processor.tokenizer
+
+     def encode_text(self, batch_texts: List[str] | List[List[str]]):
+         if isinstance(batch_texts[0], list):
+             batch_texts = [
+                 text for _, texts in enumerate(batch_texts) for text in texts
+             ]
+         # Ensure that the input texts are within the token limit
+         max_length = self.tokenizer.model_max_length
+         if not max_length or max_length > 0xFFFFFF:
+             max_length = 512
+         encoded_inputs = self.tokenizer(
+             text=batch_texts,
+             max_length=max_length,
+             padding=True,
+             truncation=True,
+             return_tensors="pt",
+         )
+
+         inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+         with torch.no_grad():
+             text_features = self.model.get_text_features(**inputs)
+             text_features = F.normalize(text_features, p=2, dim=-1)
+         return text_features
+
+     def encode_image(self, image):
+         batch_images = torch.stack([d["pixel_values"][0] for d in image])
+         batch_images = batch_images.to(self.device)
+         with torch.no_grad():
+             image_features = self.model.get_image_features(batch_images)
+             image_features = F.normalize(image_features, p=2, dim=-1)
+         return image_features
+
+     def embed_documents(self, texts):
+         text_features = self.encode_text(texts)
+         return text_features.cpu().numpy().tolist()
+
+     def embed_query(self, text):
+         text_features = self.encode_text([text])
+         return text_features.cpu().numpy().tolist()[0]
+
+     def embed_image(self, uris: List[str]):
+         # read image and transform
+         images = [Image.open(image_path) for image_path in uris]
+         transformed_images = [
+             self.transform(
+                 image,
+                 return_tensors="pt",
+             )
+             for image in images
+         ]
+         image_features = self.encode_image(transformed_images)
+         return image_features.cpu().numpy().tolist()
+
+
+ if __name__ == "__main__":
+     model = CLIPModel("AI-ModelScope/chinese-clip-vit-large-patch14-336px")
+     model.embed_image(
+         [
+             "custom_eval/multimodal/images/AMNH.jpg",
+             "custom_eval/multimodal/images/AMNH.jpg",
+         ]
+     )
+     model.encode_text(["我喜欢吃饭" * 1000])
+     print("done")
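The new file above (its +149 line count lines up with entry 25 in the file list, evalscope/backend/rag_eval/utils/clip.py) dispatches on api_base: with an endpoint it returns the OpenAI-compatible VLMAPI captioner, otherwise a local CLIP embedder. A minimal usage sketch, assuming transformers, modelscope, and langchain-openai are installed; the "my-vlm" name and endpoint are hypothetical placeholders, only the CLIP model ID comes from the diff's __main__ block:

    from evalscope.backend.rag_eval.utils.clip import VisionModel

    # Local CLIP checkpoint; resolved via ModelScope when the path does not exist.
    clip = VisionModel.load(model_name="AI-ModelScope/chinese-clip-vit-large-patch14-336px")
    doc_vecs = clip.embed_documents(["a photo of a cat", "a photo of a dog"])
    query_vec = clip.embed_query("cat")

    # With api_base set, load() returns the VLMAPI captioning client instead.
    vlm = VisionModel.load(model_name="my-vlm", api_base="http://localhost:8000/v1")  # placeholder endpoint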
@@ -0,0 +1,183 @@
+ import os
+ import torch
+ from typing import List, Optional, Union, Dict
+ from sentence_transformers import models
+ from sentence_transformers.SentenceTransformer import SentenceTransformer
+ from sentence_transformers.cross_encoder import CrossEncoder
+ from torch import Tensor
+ from evalscope.backend.rag_eval.utils.tools import download_model
+ from evalscope.utils.logger import get_logger
+ from langchain_core.embeddings import Embeddings
+
+ logger = get_logger()
+
+
+ class BaseModel(Embeddings):
+     def __init__(
+         self,
+         model_name_or_path: str,
+         max_seq_length: int = 512,
+         prompt: str = '',
+         revision: Optional[str] = None,
+         **kwargs,
+     ):
+         self.model_name_or_path = model_name_or_path
+         self.max_seq_length = max_seq_length
+         self.model_kwargs = kwargs.pop('model_kwargs', {})
+         self.model_kwargs['trust_remote_code'] = True
+
+         self.config_kwargs = kwargs.pop('config_kwargs', {})
+         self.config_kwargs['trust_remote_code'] = True
+
+         self.encode_kwargs = kwargs.pop('encode_kwargs', {})
+         self.encode_kwargs['convert_to_tensor'] = True
+
+         self.prompt = prompt
+         self.revision = revision
+
+     @property
+     def mteb_model_meta(self):
+         """Model metadata for MTEB (Massive Text Embedding Benchmark)"""
+         from mteb import ModelMeta
+
+         return ModelMeta(
+             name=os.path.basename(self.model_name_or_path),
+             revision=self.revision,
+             languages=None,
+             release_date=None,
+         )
+
+     def embed_documents(self, texts: List[str]) -> List[List[float]]:
+         """Embed search docs. Compatible with langchain.
+
+         Args:
+             texts: List of text to embed.
+
+         Returns:
+             List of embeddings.
+         """
+         return self.encode_corpus(texts).tolist()
+
+     def embed_query(self, text: str) -> List[float]:
+         """Embed query text. Compatible with langchain.
+
+         Args:
+             text: Text to embed.
+
+         Returns:
+             Embedding.
+         """
+         return self.encode_queries(text).tolist()
+
+     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
+         """Embed text."""
+         raise NotImplementedError
+
+     def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
+         """Embed query text. Compatible with mteb."""
+         raise NotImplementedError
+
+     def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
+         """Embed search docs. Compatible with mteb."""
+         raise NotImplementedError
+
+
+ class SentenceTransformerModel(BaseModel):
+     def __init__(
+         self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs
+     ):
+         super().__init__(model_name_or_path, **kwargs)
+
+         if not pooling_mode:
+             self.model = SentenceTransformer(
+                 self.model_name_or_path,
+                 config_kwargs=self.config_kwargs,
+                 model_kwargs=self.model_kwargs,
+             )
+         else:
+             word_embedding_model = models.Transformer(
+                 self.model_name_or_path,
+                 config_args=self.config_kwargs,
+                 model_args=self.model_kwargs,
+             )
+             pooling_model = models.Pooling(
+                 word_embedding_model.get_word_embedding_dimension(),
+                 pooling_mode=pooling_mode,
+             )
+             self.model = SentenceTransformer(
+                 modules=[word_embedding_model, pooling_model],
+             )
+
+         self.model.max_seq_length = self.max_seq_length
+
+     def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
+         kwargs.pop('prompt_name', '')  # remove prompt name, use prompt
+         self.encode_kwargs.update(kwargs)
+
+         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
+         assert isinstance(embeddings, Tensor)
+         return embeddings.cpu().detach()
+
+     def encode_queries(self, queries, **kwargs):
+         return self.encode(queries, prompt=self.prompt)
+
+     def encode_corpus(self, corpus, **kwargs):
+         if isinstance(corpus[0], dict):
+             input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+         else:
+             input_texts = corpus
+         return self.encode(input_texts)
+
+
+ class CrossEncoderModel(BaseModel):
+     def __init__(self, model_name_or_path: str, **kwargs):
+         super().__init__(model_name_or_path, **kwargs)
+         self.model = CrossEncoder(
+             self.model_name_or_path,
+             trust_remote_code=True,
+             max_length=self.max_seq_length,
+         )
+
+     def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+         self.encode_kwargs.update(kwargs)
+
+         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
+             processed_sentences = []
+             for query, docs, instruction in sentences:
+                 if isinstance(docs, dict):
+                     docs = docs['text']
+                 processed_sentences.append((self.prompt + query, docs))
+             sentences = processed_sentences
+         embeddings = self.model.predict(sentences, **self.encode_kwargs)
+         assert isinstance(embeddings, Tensor)
+         return embeddings
+
+
+ class EmbeddingModel:
+     """Custom embeddings"""
+
+     @staticmethod
+     def load(
+         model_name_or_path: str = '',
+         is_cross_encoder: bool = False,
+         hub: str = 'modelscope',
+         revision: Optional[str] = 'master',
+         **kwargs,
+     ):
+         # If the model path does not exist locally and hub is 'modelscope', download the model
+         if not os.path.exists(model_name_or_path) and hub == 'modelscope':
+             model_name_or_path = download_model(model_name_or_path, revision)
+
+         # Return different model instances based on whether it is a cross-encoder
+         if is_cross_encoder:
+             return CrossEncoderModel(
+                 model_name_or_path,
+                 revision=revision,
+                 **kwargs,
+             )
+         else:
+             return SentenceTransformerModel(
+                 model_name_or_path,
+                 revision=revision,
+                 **kwargs,
+             )
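EmbeddingModel.load above (+183 lines, matching entry 26, evalscope/backend/rag_eval/utils/embedding.py) is the single entry point for both bi-encoders and rerankers. A minimal sketch, assuming sentence-transformers and modelscope are installed; both model IDs below are hypothetical placeholders:

    from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel

    # Bi-encoder: exposes the LangChain Embeddings interface via encode_corpus/encode_queries.
    embedder = EmbeddingModel.load(model_name_or_path="AI-ModelScope/bge-large-zh")  # placeholder ID
    vectors = embedder.embed_documents(["first passage", "second passage"])

    # Cross-encoder reranker: predict() scores (query, document) pairs directly.
    reranker = EmbeddingModel.load(model_name_or_path="my-reranker", is_cross_encoder=True)  # placeholder ID
    scores = reranker.predict([["what is the capital of France", "Paris is the capital of France"]])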
@@ -0,0 +1,72 @@
+ import os
+ from typing import Any, Dict, Iterator, List, Mapping, Optional
+ from modelscope.utils.hf_util import GenerationConfig
+ from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+ from langchain_core.language_models.llms import LLM as BaseLLM
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from langchain_openai import ChatOpenAI
+
+
+ class LLM:
+     @staticmethod
+     def load(**kw):
+         api_base = kw.get('api_base', None)
+         if api_base:
+             return ChatOpenAI(
+                 model_name=kw.get('model_name', ''),
+                 openai_api_base=api_base,
+                 openai_api_key=kw.get('api_key', 'EMPTY'),
+             )
+         else:
+             return LocalLLM(**kw)
+
+
+ class LocalLLM(BaseLLM):
+     """A custom LLM that loads a model from a given path and performs inference."""
+
+     model_name_or_path: str
+     model_revision: str = 'master'
+     template_type: str = 'default'
+     model_name: Optional[str]
+     model: Optional[ChatGenerationModelAdapter]
+     generation_config: Optional[Dict]
+
+     def __init__(self, **kw):
+         super().__init__(**kw)
+         self.model_name = os.path.basename(self.model_name_or_path)
+         self.model = ChatGenerationModelAdapter(
+             model_id=self.model_name_or_path,
+             model_revision=self.model_revision,
+             template_type=self.template_type,
+             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+         )
+
+     def _call(
+         self,
+         prompt: str,
+         stop: Optional[List[str]] = None,
+         run_manager: Optional[CallbackManagerForLLMRun] = None,
+         **kwargs: Any,
+     ) -> str:
+         """Run the LLM on the given input."""
+         infer_cfg = {'stop': stop}
+
+         response = self.model._model_generate(prompt, infer_cfg)
+         return response
+
+     @property
+     def _identifying_params(self) -> Dict[str, Any]:
+         """Return a dictionary of identifying parameters."""
+         return {
+             # The model name allows users to specify custom token counting
+             # rules in LLM monitoring applications (e.g., in LangSmith users
+             # can provide per token pricing for their model and monitor
+             # costs for the given LLM.)
+             'model_name': self.model_name,
+             'revision': self.model_revision,
+         }
+
+     @property
+     def _llm_type(self) -> str:
+         """Get the type of language model used by this chat model. Used for logging purposes only."""
+         return self.model_name
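LLM.load (+72 lines, matching entry 27, evalscope/backend/rag_eval/utils/llm.py) follows the same dispatch as VisionModel.load: an api_base yields a ChatOpenAI client, otherwise LocalLLM wraps evalscope's ChatGenerationModelAdapter behind the LangChain LLM interface. A sketch; the endpoint, model name, and checkpoint path are placeholders:

    from evalscope.backend.rag_eval.utils.llm import LLM

    # Remote: any OpenAI-compatible endpoint.
    remote = LLM.load(model_name="my-chat-model", api_base="http://localhost:8000/v1")  # placeholders

    # Local: checkpoint loaded through ChatGenerationModelAdapter.
    local = LLM.load(model_name_or_path="/path/to/checkpoint")  # placeholder path
    print(local.invoke("Hello"))  # standard LangChain invocation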
@@ -0,0 +1,63 @@
+ import io
+ import os
+ import base64
+ from modelscope import snapshot_download
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def PIL_to_bytes(image_format, **kwargs):
+     OPTIONS = {
+         "webp": dict(format="webp", lossless=True),
+         "png": dict(format="png"),
+         "jpg": dict(format="jpeg"),
+     }
+
+     def transform(image):
+         bytestream = io.BytesIO()
+         image.save(bytestream, **OPTIONS[image_format])
+         return bytestream.getvalue()
+
+     return transform
+
+
+ def PIL_to_base64(image, **kwargs):
+     bytestream = io.BytesIO()
+     image.save(bytestream, format="jpeg")
+     return base64.b64encode(bytestream.getvalue()).decode("utf-8")
+
+
+ def path_to_bytes(filepath):
+     with open(filepath, "rb") as fp:
+         return fp.read()
+
+
+ def path_to_base64(filepath):
+     file_content = path_to_bytes(filepath)
+     return base64.b64encode(file_content).decode("utf-8")
+
+
+ def ensure_dir(file_path):
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+
+ def save_to_jsonl(df, file_path):
+     ensure_dir(file_path)
+     df.to_json(file_path, orient="records", lines=True, force_ascii=False)
+
+
+ def save_to_tsv(df, file_path):
+     ensure_dir(file_path)
+     df.to_csv(file_path, sep="\t", index=False)
+
+
+ def download_model(model_id: str, revision: str):
+     """
+     default base dir: '~/.cache/modelscope/hub/model_id'
+     """
+     logger.info(f"Loading model {model_id} from modelscope")
+
+     model_path = snapshot_download(model_id=model_id, revision=revision)
+
+     return model_path
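The helpers above (+63 lines, matching entry 28, evalscope/backend/rag_eval/utils/tools.py) are the I/O glue used by the new clip.py: PIL_to_bytes returns a per-format transform closure, and download_model resolves a ModelScope snapshot. A usage sketch; the image path and model ID are placeholders:

    from PIL import Image
    from evalscope.backend.rag_eval.utils.tools import PIL_to_bytes, PIL_to_base64, download_model

    image = Image.open("example.jpg")      # placeholder path
    to_png = PIL_to_bytes("png")           # returns a transform closure, not bytes
    png_bytes = to_png(image)
    b64 = PIL_to_base64(image)             # JPEG-encoded base64 string for data URLs
    model_path = download_model("AI-ModelScope/some-model", "master")  # placeholder ID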
@@ -1,10 +1,11 @@
+ import copy
+ import subprocess
+ from functools import partial
  from typing import Optional, Union
- from evalscope.utils import is_module_installed, get_valid_list
+
  from evalscope.backend.base import BackendManager
+ from evalscope.utils import get_valid_list, is_module_installed
  from evalscope.utils.logger import get_logger
- from functools import partial
- import subprocess
- import copy

  logger = get_logger()

@@ -19,6 +20,7 @@ class ExecutionMode:


  class VLMEvalKitBackendManager(BackendManager):
+
      def __init__(self, config: Union[str, dict], **kwargs):
          """BackendManager for VLM Evaluation Kit

@@ -36,7 +38,6 @@ class VLMEvalKitBackendManager(BackendManager):

          self._check_valid()

-
      def _check_valid(self):
          # Ensure not both model and datasets are empty
          if not self.args.data or not self.args.model:
@@ -45,15 +46,15 @@ class VLMEvalKitBackendManager(BackendManager):
          # Check datasets
          valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
          if len(invalid_datasets) != 0:
-             logger.warning(f"Using custom dataset: {invalid_datasets}, ")
-
+             logger.warning(f'Using custom dataset: {invalid_datasets}, ')
+
          # Check model
          if isinstance(self.args.model[0], dict):
              model_names = [model['name'] for model in self.args.model]
              valid_model_names, invalid_model_names = get_valid_list(model_names, self.valid_model_names)
              assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
                  f'refer to the following list to get proper model name: {self.valid_model_names}'
-
+
              # set model_cfg
              new_model_names = []
              for model_cfg in self.args.model:
@@ -62,19 +63,15 @@ class VLMEvalKitBackendManager(BackendManager):
                  if model_name == 'CustomAPIModel':
                      model_type = model_cfg['type']
                      remain_cfg = copy.deepcopy(model_cfg)
-                     del remain_cfg['name'] # remove not used args
-                     del remain_cfg['type'] # remove not used args
-
-                     self.valid_models.update({
-                         model_type: partial(model_class,
-                                             model=model_type,
-                                             **remain_cfg)
-                     })
+                     del remain_cfg['name']  # remove not used args
+                     del remain_cfg['type']  # remove not used args
+
+                     self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
                      new_model_names.append(model_type)
                  else:
                      remain_cfg = copy.deepcopy(model_cfg)
-                     del remain_cfg['name'] # remove not used args
-
+                     del remain_cfg['name']  # remove not used args
+
                      self.valid_models[model_name] = partial(model_class, **remain_cfg)
                      new_model_names.append(model_name)

@@ -83,7 +80,7 @@ class VLMEvalKitBackendManager(BackendManager):
          elif isinstance(self.args.model[0], str):
              valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
              if len(invalid_datasets) != 0:
-                 logger.warning(f"Using custom dataset: {invalid_datasets}, ")
+                 logger.warning(f'Using custom dataset: {invalid_datasets}, ')

      @property
      def cmd(self):
@@ -127,7 +124,7 @@ class VLMEvalKitBackendManager(BackendManager):
              f'--data {" ".join(self.args.data)} ' \
              f'{self.get_restore_arg("verbose", self.args.verbose)} ' \
              f'{self.get_restore_arg("ignore", self.args.ignore)} ' \
-             f'{self.get_restore_arg("rerun", self.args.rerun)} ' \
+             f'{self.get_restore_arg("reuse", self.args.reuse)} ' \
              f'{self.get_arg_with_default("work-dir", self.args.work_dir)} ' \
              f'{self.get_arg_with_default("limit", self.args.limit)} ' \
              f'{self.get_arg_with_default("mode", self.args.mode)} ' \
@@ -141,7 +138,12 @@ class VLMEvalKitBackendManager(BackendManager):
          if run_mode == ExecutionMode.CMD:
              logger.info(f'** Run command: {self.cmd}')
              try:
-                 subprocess.run(self.cmd, check=True, ext=True, shell=True,)
+                 subprocess.run(
+                     self.cmd,
+                     check=True,
+                     ext=True,
+                     shell=True,
+                 )
              except subprocess.CalledProcessError as e:
                  logger.error(f'** Run command failed: {e.stderr}')
                  raise
@@ -0,0 +1 @@
+ {'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}
@@ -0,0 +1,5 @@
+ {'input': '毛毛骑在牛背上过河,他共有甲、乙、丙、丁4头牛,甲过河要20分钟,乙过河要30分钟,丙过河要40分钟,丁过河要50分钟。毛毛每次只能赶2头牛过河,要把4头牛都赶到对岸去,最少要多少分钟?', 'A': '190', 'B': '180', 'C': '170', 'D': '160', 'target': 'D'}
+ {'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
+ {'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
+ {'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
+ {'input': '20世纪初,英国首相阿斯奎斯说:"我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。"这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国"光荣革命"宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国"世界工厂"的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
@@ -0,0 +1,5 @@
+ {'input': 'A "dished face" profile is often associated with', 'A': 'a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'B': 'a recessive maxilla due to failure of elongation of the cranial base.', 'C': 'an enlarged frontal bone due to hydrocephaly.', 'D': 'defective development of the maxillary air sinus.', 'target': 'B'}
+ {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.', 'A': 'Service quality.', 'B': 'Service action.', 'C': 'Service recovery.', 'D': 'Service satisfaction.', 'target': 'A'}
+ {'input': ' Information collected for the first time specifically for a marketing research study is called:', 'A': 'Secondary research.', 'B': 'Primary research.', 'C': 'Soft research.', 'D': 'Experimental research.', 'target': 'B'}
+ {'input': "This includes advertisements that contain 'call-to-response' mechanisms such as telephone numbers, website addresses, email and postal addresses:", 'A': 'Direct response advertising.', 'B': 'Sales promotions.', 'C': 'Mass media advertising.', 'D': 'Public relations.', 'target': 'A'}
+ {'input': 'Which of the following is not part of the external marketing environment?', 'A': 'Political.', 'B': 'Legal.', 'C': 'Product.', 'D': 'Socio-cultural.', 'target': 'C'}
@@ -0,0 +1,5 @@
+ {'example_id': 'middle4227.txt', 'article': 'There are many kinds...ealthy.\n,.', 'answer': 'D', 'question': 'We may read this pas... in _ .', 'options': ['a letter', 'a story', 'a newspaper', 'a health magazine']}
+ {'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
+ {'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
+ {'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
+ {'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
@@ -0,0 +1,5 @@
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the man behind The Chipmunks?"}], "ideal": ["David Seville", "david seville"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}], "ideal": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd.", "sunset boulevard", "sunset bulevard", "west sunset boulevard", "sunset blvd"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the next British Prime Minister after Arthur Balfour?"}], "ideal": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "henry campbell bannerman", "sir henry campbell bannerman", "campbell bannerman"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who had a 70s No 1 hit with Kiss You All Over?"}], "ideal": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment", "exiles", "voluntary exile", "forced exile", "banish", "self exile", "exile politics and government", "exile in greek tragedy", "sent into exile", "banishment", "transported for life", "exile", "internal exile", "exile and banishment"]}
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
@@ -1,20 +1,21 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from abc import abstractmethod
- import os, sys, time
+ import os
  from argparse import ArgumentParser
- import subprocess
-

  from evalscope.cli.base import CLICommand
- from evalscope.perf.http_client import add_argument, run_perf_benchmark
+ from evalscope.perf.arguments import add_argument
+ from evalscope.perf.main import run_perf_benchmark

  current_path = os.path.dirname(os.path.abspath(__file__))
  root_path = os.path.dirname(current_path)
+
+
  def subparser_func(args):
      """ Function which will be called for a specific sub parser.
      """
      return PerfBenchCMD(args)
-
+
+
  class PerfBenchCMD(CLICommand):
      name = 'perf'

@@ -28,10 +29,6 @@ class PerfBenchCMD(CLICommand):
          parser = parsers.add_parser(PerfBenchCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)
-
+
      def execute(self):
          run_perf_benchmark(self.args)
-
-
-
-
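This hunk tracks the perf refactor (entries 40-43 in the file list): add_argument now lives in evalscope.perf.arguments and run_perf_benchmark in evalscope.perf.main, so the CLI only wires the subparser. A minimal programmatic sketch built from those two imports; everything else is standard argparse:

    from argparse import ArgumentParser
    from evalscope.perf.arguments import add_argument
    from evalscope.perf.main import run_perf_benchmark

    parser = ArgumentParser("evalscope perf")
    add_argument(parser)        # registers the perf benchmark flags
    args = parser.parse_args()
    run_perf_benchmark(args)    # same call PerfBenchCMD.execute() makes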
@@ -51,7 +51,7 @@ try:
      punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'

      if not os.path.exists(punkt_path):
-         os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+         os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
          os.system(f'unzip {punkt_path} -d {nltk_dir}')
      else:
          logger.info(f'{punkt_path} already exists, skipping download')