evalscope 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +96 -96
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +70 -71
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
- evalscope/backend/rag_eval/utils/__init__.py +0 -0
- evalscope/backend/rag_eval/utils/clip.py +149 -0
- evalscope/backend/rag_eval/utils/embedding.py +183 -0
- evalscope/backend/rag_eval/utils/llm.py +72 -0
- evalscope/backend/rag_eval/utils/tools.py +63 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/METADATA +14 -13
- {evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/RECORD +16 -11
- {evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/WHEEL +1 -1
- {evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/utils/llm.py ADDED
@@ -0,0 +1,72 @@
+import os
+from typing import Any, Dict, Iterator, List, Mapping, Optional
+from modelscope.utils.hf_util import GenerationConfig
+from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+from langchain_core.language_models.llms import LLM as BaseLLM
+from evalscope.models.model_adapter import ChatGenerationModelAdapter
+from langchain_openai import ChatOpenAI
+
+
+class LLM:
+    @staticmethod
+    def load(**kw):
+        api_base = kw.get('api_base', None)
+        if api_base:
+            return ChatOpenAI(
+                model_name=kw.get('model_name', ''),
+                openai_api_base=api_base,
+                openai_api_key=kw.get('api_key', 'EMPTY'),
+            )
+        else:
+            return LocalLLM(**kw)
+
+
+class LocalLLM(BaseLLM):
+    """A custom LLM that loads a model from a given path and performs inference."""
+
+    model_name_or_path: str
+    model_revision: str = 'master'
+    template_type: str = 'default'
+    model_name: Optional[str]
+    model: Optional[ChatGenerationModelAdapter]
+    generation_config: Optional[Dict]
+
+    def __init__(self, **kw):
+        super().__init__(**kw)
+        self.model_name = os.path.basename(self.model_name_or_path)
+        self.model = ChatGenerationModelAdapter(
+            model_id=self.model_name_or_path,
+            model_revision=self.model_revision,
+            template_type=self.template_type,
+            generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+        )
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Run the LLM on the given input."""
+        infer_cfg = {'stop': stop}
+
+        response = self.model._model_generate(prompt, infer_cfg)
+        return response
+
+    @property
+    def _identifying_params(self) -> Dict[str, Any]:
+        """Return a dictionary of identifying parameters."""
+        return {
+            # The model name allows users to specify custom token counting
+            # rules in LLM monitoring applications (e.g., in LangSmith users
+            # can provide per token pricing for their model and monitor
+            # costs for the given LLM.)
+            'model_name': self.model_name,
+            'revision': self.model_revision,
+        }
+
+    @property
+    def _llm_type(self) -> str:
+        """Get the type of language model used by this chat model. Used for logging purposes only."""
+        return self.model_name
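The new `LLM.load` is a small factory: when `api_base` is supplied it returns a LangChain `ChatOpenAI` client pointed at that OpenAI-compatible endpoint; otherwise the kwargs fall through to `LocalLLM`, which wraps EvalScope's `ChatGenerationModelAdapter`. A minimal usage sketch (the endpoint, key, and model names below are placeholders, not values from the diff):

```python
from evalscope.backend.rag_eval.utils.llm import LLM

# Remote path: any kwargs that include api_base yield a ChatOpenAI client.
remote_llm = LLM.load(
    model_name='qwen2-7b-instruct',       # placeholder model name
    api_base='http://127.0.0.1:8000/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
)

# Local path: without api_base, the kwargs initialize LocalLLM, which loads
# the checkpoint through ChatGenerationModelAdapter.
local_llm = LLM.load(model_name_or_path='qwen/Qwen2-0.5B-Instruct')

# LocalLLM subclasses LangChain's LLM base class, so invoke() routes into _call().
print(local_llm.invoke('Hello'))
```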
evalscope/backend/rag_eval/utils/tools.py ADDED
@@ -0,0 +1,63 @@
+import io
+import os
+import base64
+from modelscope import snapshot_download
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def PIL_to_bytes(image_format, **kwargs):
+    OPTIONS = {
+        "webp": dict(format="webp", lossless=True),
+        "png": dict(format="png"),
+        "jpg": dict(format="jpeg"),
+    }
+
+    def transform(image):
+        bytestream = io.BytesIO()
+        image.save(bytestream, **OPTIONS[image_format])
+        return bytestream.getvalue()
+
+    return transform
+
+
+def PIL_to_base64(image, **kwargs):
+    bytestream = io.BytesIO()
+    image.save(bytestream, format="jpeg")
+    return base64.b64encode(bytestream.getvalue()).decode("utf-8")
+
+
+def path_to_bytes(filepath):
+    with open(filepath, "rb") as fp:
+        return fp.read()
+
+
+def path_to_base64(filepath):
+    file_content = path_to_bytes(filepath)
+    return base64.b64encode(file_content).decode("utf-8")
+
+
+def ensure_dir(file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+
+def save_to_jsonl(df, file_path):
+    ensure_dir(file_path)
+    df.to_json(file_path, orient="records", lines=True, force_ascii=False)
+
+
+def save_to_tsv(df, file_path):
+    ensure_dir(file_path)
+    df.to_csv(file_path, sep="\t", index=False)
+
+
+def download_model(model_id: str, revision: str):
+    """
+    default base dir: '~/.cache/modelscope/hub/model_id'
+    """
+    logger.info(f"Loading model {model_id} from modelscope")
+
+    model_path = snapshot_download(model_id=model_id, revision=revision)
+
+    return model_path
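The new `tools.py` bundles small I/O helpers: image-to-bytes/base64 converters, DataFrame writers that create parent directories first, and a ModelScope snapshot download. A short sketch of how they compose (the image and output path are illustrative):

```python
import pandas as pd
from PIL import Image

from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, PIL_to_bytes, save_to_jsonl

image = Image.new('RGB', (64, 64), 'white')  # stand-in for a real image

# PIL_to_bytes returns a transform closure keyed by format: 'webp', 'png', or 'jpg'.
png_bytes = PIL_to_bytes('png')(image)

# PIL_to_base64 always re-encodes as JPEG before base64-encoding.
b64 = PIL_to_base64(image)

# save_to_jsonl calls ensure_dir first; note that ensure_dir assumes the
# path contains a directory component.
df = pd.DataFrame([{'id': 1, 'image_b64': b64}])
save_to_jsonl(df, 'outputs/example.jsonl')
```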
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -51,7 +51,7 @@ try:
     punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
 
     if not os.path.exists(punkt_path):
-        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+        os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
         logger.info(f'{punkt_path} already exists, skipping download')
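The one-line change here hardens the NLTK `punkt_tab` fetch: `--timeout=10` caps how long wget waits on a stalled connection, and `--tries=3` retries failed attempts. For reference, a dependency-free sketch of the same timeout-and-retry behavior in pure Python (the function name and back-off policy are my own, not part of the package):

```python
import time
import urllib.request

def download_with_retries(url: str, dest: str, timeout: float = 10, tries: int = 3) -> None:
    """Fetch url into dest, mirroring wget's --timeout/--tries semantics."""
    for attempt in range(1, tries + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                with open(dest, 'wb') as fp:
                    fp.write(resp.read())
            return
        except OSError:
            if attempt == tries:
                raise
            time.sleep(2 ** attempt)  # exponential back-off between attempts
```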
evalscope/version.py CHANGED

{evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0
+Version: 0.6.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -28,7 +28,7 @@ Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow<=17.0.0
 Requires-Dist: pympler
 Requires-Dist: pyyaml
 Requires-Dist: regex
@@ -61,7 +61,7 @@ Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: plotly; extra == "all"
-Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pyarrow<=17.0.0; extra == "all"
 Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: regex; extra == "all"
@@ -80,10 +80,10 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "all"
+Requires-Dist: ragas==0.2.5; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -112,10 +112,10 @@ Requires-Dist: tqdm; extra == "inner"
 Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
 Requires-Dist: transformers-stream-generator; extra == "inner"
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
 Provides-Extra: rag
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "rag"
+Requires-Dist: ragas==0.2.5; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
@@ -139,6 +139,7 @@ Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
     <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>
 
+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -164,7 +165,7 @@ EvalScope is the official model evaluation and performance benchmarking framework
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**: 
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -251,7 +252,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc 
+ --datasets arc
 ```
 
 #### Install from source
@@ -358,13 +359,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
 
 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection. 
+You can use local dataset to evaluate the model without internet connection.
 
 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)
 
 
 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report. 
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
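Beyond the version bump, the substantive METADATA change is pinning previously open-ended dependencies: `pyarrow<=17.0.0`, `ms-opencompass>=0.1.3`, `mteb==1.19.4`, and `ragas==0.2.5` (the last two in both the `all` and `rag` extras). A quick stdlib check of what a local environment actually resolved for these pins (a sketch, not part of the package):

```python
from importlib.metadata import PackageNotFoundError, version

for pkg in ('evalscope', 'pyarrow', 'ms-opencompass', 'mteb', 'ragas'):
    try:
        print(f'{pkg}=={version(pkg)}')
    except PackageNotFoundError:
        print(f'{pkg}: not installed')
```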
{evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/RECORD CHANGED
@@ -6,7 +6,7 @@ evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
 evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=o4SLhBjhMLzVbUK1flGxf-kiqIBLnLnJbxG06BmvkyU,118
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,7 +14,7 @@ evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG
 evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
-evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=
+evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=3V67A2LSj_XaiGd9fqdKpxpzyNrfynCH3UnhaBtAaqc,5326
 evalscope/backend/rag_eval/__init__.py,sha256=8om6TVnTMmyTEQt1jBuUQA4UfIzyps-_-ih90H_Qjio,284
 evalscope/backend/rag_eval/backend_manager.py,sha256=jmO-UMu6_iOXMnl4--PrMWCsnIYEhsbiX017rtURqm0,2997
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=gDXCiRUTSeGQHxd5SjQsnphMqHJ2si2jywRiHvujEOg,150
@@ -30,10 +30,10 @@ evalscope/backend/rag_eval/cmteb/arguments.py,sha256=wZvnVir2tSxYCV_DPR3TSDj4Vxt
 evalscope/backend/rag_eval/cmteb/base.py,sha256=fYrIjKwOLwBAHb2rlNkEjYScjZ5Qpyv2LdMmWZYWREA,2830
 evalscope/backend/rag_eval/cmteb/task_template.py,sha256=Clyc8TZCtZrL6MjAw49rh55Xb3hf2y1C3SzLvZsorLE,2646
 evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=7adR40W6Uu58-QR9jCUP4k7TdAnG0oT225v4xHXah2g,10635
-evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256
+evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-oJ9rXy7pgOB7Gyf68TcSlmmAUoBx5hKofcKNuIsCd8,8977
 evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=rF6dtrwOfvJoq2Y4myZg9_638M1g06qq0hWCmvxsIo0,2039
 evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=2WkaTE-jF8jqsu1UcNDqN8A4567UzW5boD_0B83j-9A,4008
-evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=
+evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=C34nDuya8OT3aeMxYCYjUpUtWp7w00jSfIYQSInlNAg,5329
 evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=wUxiQH5aOmWNS4YswACyHqBn5xqP5eyvsq6U9WSp5R0,11457
 evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=6GMaoCANM-IKYLk4srHOYr_eurav3DGihHMQeJPXR6k,12054
 evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=eBHm_TWeh7WiwpdVBtUlegeXMAxJyVQdUHRhJERobIs,1506
@@ -44,8 +44,13 @@ evalscope/backend/rag_eval/ragas/metrics/__init__.py,sha256=HgY5nrcNtWpQ7gBi5lCE
 evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py,sha256=Uqz5qWZ76Gos95_QlhwncbATXyk0YX4wkI0LiAdPElU,3838
 evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py,sha256=CdLnWHq1eTna6j3F5-pncW5YusxD_v3ScjzeCsZ7mng,3967
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=WO2xja0g0JSiYGdu2uAEDQgDceuFcgPWwPoqFnwDU0s,172
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=nX-dG0Fm1629pSASujuEmMODFZf1955WncNNykRrNtI,9305
 evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=bXOqik6qKWzbrEz21ykdkqeqqPrmoUIhTwW6eRQXy0M,2222
+evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/backend/rag_eval/utils/clip.py,sha256=frafvJ1soUtjFUmi-053_Fhg6ERRwyvczQBlLWAX9vE,5104
+evalscope/backend/rag_eval/utils/embedding.py,sha256=RZf0JlovZY_cCBsq8MMUqC_Sy78WtKLY_rBAlRA_udo,6239
+evalscope/backend/rag_eval/utils/llm.py,sha256=9tFwMNoTf3jNomgDu5qqVLO92HtEtelH3DXpny9_B2g,2552
+evalscope/backend/rag_eval/utils/tools.py,sha256=LpcYoeIBj1btzQ1_P84u1dYCdRWhMtiltxihmZCvWKk,1528
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
 evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ewhpE9yzsqf5ED6kqsqek2YEgg96GBQOupxtVNhaXxI,6046
 evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
@@ -132,7 +137,7 @@ evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lX
 evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
 evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
-evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
+evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=MXcHwmsXnh9mQZR1Bt5St6DNwXY-mfz4dNM8y6a23dc,12236
 evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
 evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
 evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
@@ -204,8 +209,8 @@ evalscope/utils/logger.py,sha256=cf3U400Mx1speMMNXorjwEE8noDz5Mbd-9PNgaulGeY,301
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
 evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.6.
-evalscope-0.6.
-evalscope-0.6.
-evalscope-0.6.
-evalscope-0.6.
+evalscope-0.6.1.dist-info/METADATA,sha256=n4CpTzJGnhgqEsfbL1UfZtXHULmeNCGnKChyi6eT8Fw,21237
+evalscope-0.6.1.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
+evalscope-0.6.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.6.1.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.6.1.dist-info/RECORD,,
{evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/entry_points.txt: file without changes
{evalscope-0.6.0.dist-info → evalscope-0.6.1.dist-info}/top_level.txt: file without changes
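Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is an unpadded urlsafe-base64 SHA-256 of the file contents, per the wheel spec. A sketch for recomputing one locally (the path is illustrative; point it at any installed file you want to verify):

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return the unpadded urlsafe-base64 sha256 digest used in wheel RECORD files."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')

# Example: compare against the RECORD entry for evalscope/version.py (size 118).
print(record_digest('evalscope/version.py'))
```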