evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/models/api/openai_api.py

@@ -1,34 +1,36 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 import json
+import requests
 import threading
 import time
 from asyncio import Queue
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from typing import Dict, List, Optional, Union

-import requests
-from typing import Union, List, Optional, Dict
-from concurrent.futures import ThreadPoolExecutor
-from modelscope.utils.logger import get_logger
+from evalscope.utils.logger import get_logger

 logger = get_logger()


 class OpenaiApi:

-    def __init__(self,
-                 model: str,
-                 openai_api_key,
-                 openai_api_base,
-                 logprobs: Optional[bool] = False,
-                 top_logprobs: Optional[int] = None,
-                 max_new_tokens: int = 4096,
-                 temperature: Optional[float] = 0.0,
-                 repetition_penalty: Optional[float] = 1.0,
-                 is_chat: bool = True,
-                 verbose: bool = True,
-                 retry: int = 3,
-                 query_per_second: int = 10,  # TODO
-                 **kwargs):
+    def __init__(
+            self,
+            model: str,
+            openai_api_key,
+            openai_api_base,
+            logprobs: Optional[bool] = False,
+            top_logprobs: Optional[int] = None,
+            max_new_tokens: int = 4096,
+            temperature: Optional[float] = 0.0,
+            repetition_penalty: Optional[float] = 1.0,
+            is_chat: bool = True,
+            verbose: bool = True,
+            retry: int = 3,
+            query_per_second: int = 10,  # TODO
+            **kwargs):

         self.temperature = temperature
         self.repetition_penalty = repetition_penalty
@@ -45,14 +47,17 @@ class OpenaiApi:

         self.token_bucket = TokenBucket(query_per_second, verbose)

-    def generate_simple(self, inputs: Union[List[str]]):
+    def generate_simple(self, inputs: Union[List[str]], num_proc: int = 8):

         def process_one(in_data: str):

             if self.is_chat:
                 data = dict(
                     model=self.model,
-                    messages=[{'role': 'user', 'content': in_data}],
+                    messages=[{
+                        'role': 'user',
+                        'content': in_data
+                    }],
                     max_tokens=self.max_tokens,
                     n=1,
                     logprobs=self.logprobs,
@@ -72,7 +77,10 @@ class OpenaiApi:

             # todo
             openai_api_key = self.openai_api_key or ''
-            header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+            header = {
+                'Authorization': f'Bearer {openai_api_key}',
+                'content-type': 'application/json',
+            }
             data = json.dumps(data, ensure_ascii=False)

             if self.verbose:
@@ -91,14 +99,18 @@ class OpenaiApi:
             else:
                 return resp['choices'][0]['text'].strip()

-        with ThreadPoolExecutor() as executor:
-            results = list(executor.map(process_one, inputs))
+        results = []
+        with ThreadPoolExecutor(max_workers=num_proc) as executor:
+            # Submit all tasks
+            future_to_task = {executor.submit(process_one, input_one): input_one for input_one in inputs}
+
+            # Show progress bar
+            for future in tqdm(as_completed(future_to_task), total=len(inputs)):
+                results.append(future.result())

         return results

-    def generate(self,
-                 inputs: Union[List[str], List[List]],
-                 **kwargs) -> List[str]:
+    def generate(self, inputs: Union[List[str], List[List]], **kwargs) -> List[str]:
         """
         Generate responses from OpenAI API.

@@ -160,13 +172,12 @@ class OpenaiApi:

         def remove_none_val(input_d: dict):
             return {k: v for k, v in input_d.items() if v is not None}
+
         data = remove_none_val(data)

         if self.verbose:
             logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
-        raw_response = requests.post(self.url,
-                                     headers=header,
-                                     data=json.dumps(data, ensure_ascii=False))
+        raw_response = requests.post(self.url, headers=header, data=json.dumps(data, ensure_ascii=False))

         response = raw_response.json()
         if self.verbose:
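
Note: the reworked generate_simple above replaces executor.map with explicit submit/as_completed so a tqdm bar can track progress. A minimal standalone sketch of that pattern follows (illustrative only; call_api and run_batch are hypothetical names, not part of evalscope):

# Sketch of the submit/as_completed pattern shown above; not taken from the package.
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def call_api(text: str) -> str:
    # Placeholder for a per-input request such as process_one().
    return text.upper()


def run_batch(inputs, num_proc: int = 8):
    results = [None] * len(inputs)
    with ThreadPoolExecutor(max_workers=num_proc) as executor:
        # Submit every input and remember its original position.
        future_to_index = {executor.submit(call_api, item): i for i, item in enumerate(inputs)}
        # as_completed yields futures in completion order, so write results back by index.
        for future in tqdm(as_completed(future_to_index), total=len(inputs)):
            results[future_to_index[future]] = future.result()
    return results


if __name__ == '__main__':
    print(run_batch(['hello', 'world']))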
evalscope/models/custom/__init__.py

@@ -1,4 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 from evalscope.models.custom.custom_model import *
-
evalscope/models/custom/custom_model.py

@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from abc import ABC, abstractmethod
-from typing import Any, Union, Dict, List
 import torch
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union


 class CustomModel(ABC):
@@ -11,7 +11,7 @@ class CustomModel(ABC):
         self.kwargs = kwargs

         if config.get('model_id', None) is None:
-            raise ValueError(f"**Error: model_id is required in config for CustomModel. Got config: {config}")
+            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')

     @abstractmethod
     @torch.no_grad()
evalscope/models/dummy_chat_model.py

@@ -2,6 +2,7 @@

 import random
 import time
+
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger

@@ -32,15 +33,13 @@ class DummyChatModel(ChatBaseModel):

         # Build response
         res = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': choice,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.MODEL_ID + '-' + self.REVISION,
             'object': 'chat.completion',
evalscope/models/model_adapter.py

@@ -1,35 +1,25 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
 # flake8: noqa
+import numpy as np
 import os
 import sys
-from typing import List, Any, Union, Dict
-import numpy as np
 import time
+import torch
 from abc import ABC, abstractmethod
 from copy import deepcopy
-
-import torch
+from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 from torch import dtype
+from typing import Any, Dict, List, Union

-from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.constants import DEFAULT_MODEL_CACHE_DIR
 from evalscope.models.custom import CustomModel
-from evalscope.models.template import get_template, StopWordsCriteria
+from evalscope.utils.chat_service import ChatMessage
 from evalscope.utils.logger import get_logger
-from transformers import StoppingCriteriaList
+from evalscope.utils.model_utils import fix_do_sample_warning

 logger = get_logger()

-# Notes:
-# - modelscope>=1.9.5
-
-
-def get_model_cache_dir(root_cache_dir: str):
-    model_cache_dir = os.path.join(root_cache_dir, 'models')
-    model_cache_dir = os.path.expanduser(model_cache_dir)
-    os.makedirs(model_cache_dir, exist_ok=True)
-    return model_cache_dir
-

 class BaseModelAdapter(ABC):
     """
@@ -69,7 +59,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
                  torch_dtype: dtype = torch.bfloat16,
                  model_revision: str = None,
                  max_length: int = None,
-                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 cache_dir: str = None,
                  **kwargs):
         """
         Args:
@@ -80,11 +70,11 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
             max_length: The max length of input sequence. Default: None.
             **kwargs: Other args.
         """
-        model_cache_dir = get_model_cache_dir(cache_dir)
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

         self.model_id: str = model_id
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'**Device: {self.device}')
+        logger.warning(f'Device: {self.device}')

         torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -93,31 +83,21 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         model_cfg['device_map'] = device_map
         model_cfg['torch_dtype'] = str(torch_dtype)

-        from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
-        # from modelscope import snapshot_download
-
-        # try:
-        #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
-        #     logger.warning('**Use local_files_only to load model **')
-        # except:
-        #     model_dir = snapshot_download(self.model_id,
-        #                                   revision=model_revision,
-        #                                   cache_dir=model_cache_dir, )
-        #     logger.warning('**Load model from ModelScope hub **')
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_id,  # self.model_id
-                                                  revision=model_revision,
-                                                  trust_remote_code=True,
-                                                  cache_dir=model_cache_dir,)
-
-        model = AutoModelForCausalLM.from_pretrained(self.model_id,  # self.model_id
-                                                     revision=model_revision,
-                                                     device_map=device_map,
-                                                     trust_remote_code=True,
-                                                     torch_dtype=torch_dtype,
-                                                     cache_dir=model_cache_dir,)
-
-        # model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,  # self.model_id
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,  # self.model_id
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )

         super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

@@ -187,18 +167,16 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         if softval.dtype in {torch.bfloat16, torch.float16}:
             softval = softval.to(dtype=torch.float32)
         probs = softval.detach().cpu().numpy()
-        pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D
+        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D

         res_d = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': pred,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': pred,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.model_id,
             'object': 'chat.completion',
@@ -226,7 +204,7 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
                  device_map: str = 'auto',
                  torch_dtype: dtype = torch.bfloat16,
                  model_revision: str = None,
-                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 cache_dir: str = None,
                  **kwargs):
         """
         Continuation-logits model adapter.
@@ -239,12 +217,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
             **kwargs: Other args.
         """

-        super().__init__(model_id=model_id,
-                         device_map=device_map,
-                         torch_dtype=torch_dtype,
-                         model_revision=model_revision,
-                         cache_dir=cache_dir,
-                         **kwargs)
+        super().__init__(
+            model_id=model_id,
+            device_map=device_map,
+            torch_dtype=torch_dtype,
+            model_revision=model_revision,
+            cache_dir=cache_dir,
+            **kwargs)

     @torch.no_grad()
     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
@@ -282,15 +261,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)

         res_d = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': pred_list,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': pred_list,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.model_id,
             'object': 'chat.completion',
@@ -347,10 +324,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

     def __init__(self,
                  model_id: str,
-                 model_revision: str,
+                 model_revision: str = 'master',
                  device_map: str = 'auto',
-                 torch_dtype: dtype = torch.float16,
-                 cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                 torch_dtype: dtype = 'auto',
+                 cache_dir: str = None,
                  **kwargs):
         """
         Chat completion model adapter. Tasks of chat and generation are supported.
@@ -359,17 +336,18 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             model_id: The model id on ModelScope, or local model_dir.
             model_revision: The model revision on ModelScope. Default: None.
             device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: torch.float16.
+            torch_dtype: The torch dtype for model inference. Default: 'auto'.
             **kwargs: Other args.
         """

         custom_generation_config = kwargs.pop('generation_config', None)
-        model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)
+        custom_chat_template = kwargs.pop('chat_template', None)
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

         self.model_id: str = model_id
         self.model_revision: str = model_revision
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'**Device: {self.device}')
+        logger.warning(f'Device: {self.device}')

         torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -378,72 +356,47 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         model_cfg['device_map'] = device_map
         model_cfg['torch_dtype'] = str(torch_dtype)

-        self.template_type = kwargs.pop('template_type', None)
-        logger.warning(f'**Template type: {self.template_type}')
-
-        from evalscope.models.template import TemplateType
-        if isinstance(self.model_id, str) \
-                and os.path.isdir(os.path.expanduser(self.model_id)) \
-                and self.template_type is None:
-            raise ValueError(f'Please specify the --template-type for local model dir.\n'
-                             f'Available template types: {TemplateType.get_template_name_list()}\n'
-                             f'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.')
-
-        from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
-        # from modelscope import snapshot_download
-
-        # try:
-        #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
-        #     logger.warning('**Use local_files_only to load model **')
-        # except:
-        #     model_dir = snapshot_download(self.model_id,
-        #                                   revision=model_revision,
-        #                                   cache_dir=model_cache_dir, )
-        #     logger.warning('**Load model from ModelScope hub **')
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_id,
-                                                  revision=model_revision,
-                                                  trust_remote_code=True,
-                                                  cache_dir=model_cache_dir,)
-
-        model = AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                     revision=model_revision,
-                                                     device_map=device_map,
-                                                     trust_remote_code=True,
-                                                     torch_dtype=torch_dtype,
-                                                     cache_dir=model_cache_dir,)
-
-        self.origin_tokenizer = deepcopy(tokenizer)
-
-        self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )
+
+        self.generation_config = self._parse_generation_config(tokenizer, model)

         if custom_generation_config:
-            logger.info('**Updating generation config ...')
-            self.generation_config.update(**custom_generation_config.to_dict())
-            logger.info(f'**Generation config init: {self.generation_config.to_dict()}')
+            logger.info('Updating generation config ...')
+            self.generation_config.update(**custom_generation_config)

-        super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
+        if custom_chat_template:
+            tokenizer.chat_template = custom_chat_template
+            logger.info(f'Using custom chat template: {custom_chat_template}')

-    def _parse_generation_config(self, tokenizer, model):
-        from modelscope.utils.hf_util import GenerationConfig
+        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

-        generation_config = getattr(model, 'generation_config', GenerationConfig())
+    def _parse_generation_config(self, tokenizer, model):
+        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))

         try:
             remote_config = GenerationConfig.from_pretrained(
-                self.model_id,
-                revision=self.model_revision,
-                trust_remote_code=True)
+                self.model_id, revision=self.model_revision, trust_remote_code=True)
             generation_config.update(**remote_config.to_dict())
         except:
             logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')

-        # Parse templates for chat-completion
         if isinstance(self.model_id, str) and os.path.exists(self.model_id):
             logger.warning(f'Got local model dir: {self.model_id}')

-        generation_template = get_template(template_type=self.template_type, tokenizer=tokenizer)
-
         if tokenizer.eos_token_id is not None:
             generation_config.eos_token_id = tokenizer.eos_token_id
         if tokenizer.pad_token_id is not None:
@@ -451,24 +404,19 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         if generation_config.max_new_tokens is None:
             generation_config.max_new_tokens = 2048

-        return generation_config, generation_template
+        return generation_config

     def _model_generate(self, query: str, infer_cfg: dict) -> str:
-        example = dict(query=query,
-                       history=[],
-                       system=None)
-
-        inputs, _ = self.generation_template.encode(example)
+        messages = [ChatMessage(role='user', content=query)]
+        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
         input_ids = inputs['input_ids']
-        input_ids = torch.tensor(input_ids)[None].to(self.device)
-        attention_mask = torch.ones_like(input_ids).to(self.device)

         # Process infer_cfg
-        infer_cfg = infer_cfg or {}
         if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
             infer_cfg['do_sample'] = True

-        # TODO: stop settings
+        # stop settings
         stop = infer_cfg.get('stop', None)
         eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
             if stop else self.tokenizer.eos_token_id
@@ -478,25 +426,16 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

         self.generation_config.update(**infer_cfg)
-
-        # stopping
-        stop_words = [self.generation_template.suffix[-1]]
-        decode_kwargs = {}
-        stopping_criteria = StoppingCriteriaList(
-            [StopWordsCriteria(self.tokenizer, stop_words, **decode_kwargs)])
+        fix_do_sample_warning(self.generation_config)

         # Run inference
-        output_ids = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            generation_config=self.generation_config,
-            stopping_criteria=stopping_criteria, )
+        output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], True, **decode_kwargs)
+        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
         return response

     @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = dict({})) -> dict:
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:

         # Process inputs
         if isinstance(inputs, str):
@@ -510,12 +449,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         response = self._model_generate(query, infer_cfg)

-        choices_list = [
-            {'index': 0,
-             'message': {'content': response,
-                         'role': 'assistant'}
-             }
-        ]
+        choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]

         res_d = {
             'choices': choices_list,
@@ -589,4 +523,3 @@ class CustomModelAdapter(BaseModelAdapter):
             raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

         return self.custom_model.predict(prompts=in_prompts, **kwargs)
-
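
Note: in the diff above, _model_generate now builds prompts with the tokenizer's own chat template instead of the removed evalscope/models/template.py. A rough standalone sketch of that flow with plain transformers follows (the model id is a placeholder, and plain dict messages are used here rather than evalscope's ChatMessage helper):

# Rough sketch of chat-template-based generation; model id and settings are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = 'Qwen/Qwen2-0.5B-Instruct'  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [{'role': 'user', 'content': 'What is the capital of France?'}]
# Render the chat messages into a single prompt string using the tokenizer's chat template.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt')
output_ids = model.generate(**inputs, max_new_tokens=64)
# Decode only the newly generated tokens, skipping the prompt and special tokens.
response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(response)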
evalscope/models/openai_model.py

@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import openai
 import os
 import time

-import openai
-
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger

@@ -43,22 +42,25 @@ class OpenAIModel(ChatBaseModel):

         logger.info(f'Using OpenAI model_id: {model_id}')

-        res = self._predict(model_id=model_id,
-                            sys_prompt=sys_prompt,
-                            user_prompt=user_prompt,
-                            temperature=temperature,
-                            max_tokens=max_tokens,
-                            mode=mode)
+        res = self._predict(
+            model_id=model_id,
+            sys_prompt=sys_prompt,
+            user_prompt=user_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            mode=mode)

         return res

-    def _predict(self,
-                 model_id,
-                 sys_prompt,
-                 user_prompt,
-                 temperature,
-                 max_tokens,
-                 mode: str = 'chat.completion',) -> dict:
+    def _predict(
+        self,
+        model_id,
+        sys_prompt,
+        user_prompt,
+        temperature,
+        max_tokens,
+        mode: str = 'chat.completion',
+    ) -> dict:

         res = {}
         openai.api_key = self.api_key
@@ -82,9 +84,8 @@ class OpenAIModel(ChatBaseModel):
                     ans_text = resp['choices'][0]['message']['content']
                     model_id = resp['model']
                 else:
-                    logger.warning(
-                        f'OpenAI GPT API call failed: got empty response '
-                        f'for input {sys_prompt} {user_prompt}')
+                    logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                   f'for input {sys_prompt} {user_prompt}')
                     ans_text = ''
                     model_id = ''

@@ -98,6 +99,5 @@ class OpenAIModel(ChatBaseModel):
             except Exception as e:
                 logger.warning(f'OpenAI API call failed: {e}')
                 time.sleep(3)
-        logger.error(
-            f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
         return res