evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic (details on the registry page).
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,35 +1,25 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI, Inc. and its affiliates.
 # flake8: noqa
+import numpy as np
 import os
 import sys
-from typing import List, Any, Union, Dict
-import numpy as np
 import time
+import torch
 from abc import ABC, abstractmethod
 from copy import deepcopy
-
-import torch
+from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 from torch import dtype
+from typing import Any, Dict, List, Union

-from evalscope.constants import
+from evalscope.constants import DEFAULT_MODEL_CACHE_DIR
 from evalscope.models.custom import CustomModel
-from evalscope.
+from evalscope.utils.chat_service import ChatMessage
 from evalscope.utils.logger import get_logger
-from
+from evalscope.utils.model_utils import fix_do_sample_warning

 logger = get_logger()

-# Notes:
-# - modelscope>=1.9.5
-
-
-def get_model_cache_dir(root_cache_dir: str):
-    model_cache_dir = os.path.join(root_cache_dir, 'models')
-    model_cache_dir = os.path.expanduser(model_cache_dir)
-    os.makedirs(model_cache_dir, exist_ok=True)
-    return model_cache_dir
-

 class BaseModelAdapter(ABC):
     """
@@ -69,7 +59,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
                  torch_dtype: dtype = torch.bfloat16,
                  model_revision: str = None,
                  max_length: int = None,
-                 cache_dir: str =
+                 cache_dir: str = None,
                  **kwargs):
         """
         Args:
@@ -80,11 +70,11 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
             max_length: The max length of input sequence. Default: None.
             **kwargs: Other args.
         """
-        model_cache_dir =
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

         self.model_id: str = model_id
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'
+        logger.warning(f'Device: {self.device}')

         torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -93,31 +83,21 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         model_cfg['device_map'] = device_map
         model_cfg['torch_dtype'] = str(torch_dtype)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                                  cache_dir=model_cache_dir,)
-
-        model = AutoModelForCausalLM.from_pretrained(self.model_id,  # self.model_id
-                                                     revision=model_revision,
-                                                     device_map=device_map,
-                                                     trust_remote_code=True,
-                                                     torch_dtype=torch_dtype,
-                                                     cache_dir=model_cache_dir,)
-
-        # model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,  # self.model_id
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,  # self.model_id
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )

         super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

@@ -187,18 +167,16 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
             if softval.dtype in {torch.bfloat16, torch.float16}:
                 softval = softval.to(dtype=torch.float32)
             probs = softval.detach().cpu().numpy()
-            pred: str = multi_choices[int(np.argmax(probs))]
+            pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D

         res_d = {
-            'choices': [
-
-
-                '
-
-                    'role': 'assistant'
-                }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': pred,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.model_id,
             'object': 'chat.completion',
@@ -226,7 +204,7 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
                  device_map: str = 'auto',
                  torch_dtype: dtype = torch.bfloat16,
                  model_revision: str = None,
-                 cache_dir: str =
+                 cache_dir: str = None,
                  **kwargs):
         """
         Continuation-logits model adapter.
@@ -239,12 +217,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
             **kwargs: Other args.
         """

-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            model_id=model_id,
+            device_map=device_map,
+            torch_dtype=torch_dtype,
+            model_revision=model_revision,
+            cache_dir=cache_dir,
+            **kwargs)

     @torch.no_grad()
     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
@@ -282,15 +261,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)

         res_d = {
-            'choices': [
-
-
-                '
-
-                    'role': 'assistant'
-                }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': pred_list,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.model_id,
             'object': 'chat.completion',
@@ -347,10 +324,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

     def __init__(self,
                  model_id: str,
-                 model_revision: str,
+                 model_revision: str = 'master',
                  device_map: str = 'auto',
-                 torch_dtype: dtype =
-                 cache_dir: str =
+                 torch_dtype: dtype = 'auto',
+                 cache_dir: str = None,
                  **kwargs):
         """
         Chat completion model adapter. Tasks of chat and generation are supported.
@@ -359,17 +336,18 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             model_id: The model id on ModelScope, or local model_dir.
             model_revision: The model revision on ModelScope. Default: None.
             device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default:
+            torch_dtype: The torch dtype for model inference. Default: 'auto'.
             **kwargs: Other args.
         """

         custom_generation_config = kwargs.pop('generation_config', None)
-
+        custom_chat_template = kwargs.pop('chat_template', None)
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

         self.model_id: str = model_id
         self.model_revision: str = model_revision
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'
+        logger.warning(f'Device: {self.device}')

         torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -378,72 +356,47 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         model_cfg['device_map'] = device_map
         model_cfg['torch_dtype'] = str(torch_dtype)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # except:
-        #     model_dir = snapshot_download(self.model_id,
-        #                                    revision=model_revision,
-        #                                    cache_dir=model_cache_dir, )
-        #     logger.warning('**Load model from ModelScope hub **')
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_id,
-                                                  revision=model_revision,
-                                                  trust_remote_code=True,
-                                                  cache_dir=model_cache_dir,)
-
-        model = AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                     revision=model_revision,
-                                                     device_map=device_map,
-                                                     trust_remote_code=True,
-                                                     torch_dtype=torch_dtype,
-                                                     cache_dir=model_cache_dir,)
-
-        self.origin_tokenizer = deepcopy(tokenizer)
-
-        self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )
+
+        self.generation_config = self._parse_generation_config(tokenizer, model)

         if custom_generation_config:
-            logger.info('
-            self.generation_config.update(**custom_generation_config
-            logger.info(f'**Generation config init: {self.generation_config.to_dict()}')
+            logger.info('Updating generation config ...')
+            self.generation_config.update(**custom_generation_config)

-
+        if custom_chat_template:
+            tokenizer.chat_template = custom_chat_template
+            logger.info(f'Using custom chat template: {custom_chat_template}')

-
-        from modelscope.utils.hf_util import GenerationConfig
+        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

-
+    def _parse_generation_config(self, tokenizer, model):
+        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))

         try:
             remote_config = GenerationConfig.from_pretrained(
-                self.model_id,
-                revision=self.model_revision,
-                trust_remote_code=True)
+                self.model_id, revision=self.model_revision, trust_remote_code=True)
             generation_config.update(**remote_config.to_dict())
         except:
             logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')

-        # Parse templates for chat-completion
         if isinstance(self.model_id, str) and os.path.exists(self.model_id):
             logger.warning(f'Got local model dir: {self.model_id}')

-        generation_template = get_template(template_type=self.template_type, tokenizer=tokenizer)
-
         if tokenizer.eos_token_id is not None:
             generation_config.eos_token_id = tokenizer.eos_token_id
         if tokenizer.pad_token_id is not None:
@@ -451,24 +404,19 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         if generation_config.max_new_tokens is None:
             generation_config.max_new_tokens = 2048

-        return generation_config
+        return generation_config

     def _model_generate(self, query: str, infer_cfg: dict) -> str:
-
-
-
-
-        inputs, _ = self.generation_template.encode(example)
+        messages = [ChatMessage(role='user', content=query)]
+        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
         input_ids = inputs['input_ids']
-        input_ids = torch.tensor(input_ids)[None].to(self.device)
-        attention_mask = torch.ones_like(input_ids).to(self.device)

         # Process infer_cfg
-        infer_cfg = infer_cfg or {}
         if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
             infer_cfg['do_sample'] = True

-        #
+        # stop settings
         stop = infer_cfg.get('stop', None)
         eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
             if stop else self.tokenizer.eos_token_id
@@ -478,25 +426,16 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

         self.generation_config.update(**infer_cfg)
-
-        # stopping
-        stop_words = [self.generation_template.suffix[-1]]
-        decode_kwargs = {}
-        stopping_criteria = StoppingCriteriaList(
-            [StopWordsCriteria(self.tokenizer, stop_words, **decode_kwargs)])
+        fix_do_sample_warning(self.generation_config)

         # Run inference
-        output_ids = self.model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            generation_config=self.generation_config,
-            stopping_criteria=stopping_criteria, )
+        output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], True
+        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
         return response

     @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict =
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:

         # Process inputs
         if isinstance(inputs, str):
@@ -510,12 +449,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

         response = self._model_generate(query, infer_cfg)

-        choices_list = [
-            {'index': 0,
-             'message': {'content': response,
-                         'role': 'assistant'}
-             }
-        ]
+        choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]

         res_d = {
             'choices': choices_list,
@@ -589,4 +523,3 @@ class CustomModelAdapter(BaseModelAdapter):
             raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

         return self.custom_model.predict(prompts=in_prompts, **kwargs)
-
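Note: the reworked generation path above calls fix_do_sample_warning from the new evalscope/utils/model_utils.py (+11 -0), whose body is not part of this diff. As a rough illustration only, a helper like this typically resets the sampling knobs that transformers warns about when do_sample is False; the actual implementation may differ:

# Illustrative sketch only -- not the code shipped in evalscope/utils/model_utils.py.
from transformers import GenerationConfig

def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
    """Reset sampling parameters that transformers warns about when do_sample=False."""
    if not generation_config.do_sample:
        generation_config.temperature = 1.0  # ignored without sampling, but warned about otherwise
        generation_config.top_p = 1.0
        generation_config.top_k = 50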
evalscope/models/openai_model.py CHANGED

@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import openai
 import os
 import time

-import openai
-
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger

@@ -43,22 +42,25 @@ class OpenAIModel(ChatBaseModel):

         logger.info(f'Using OpenAI model_id: {model_id}')

-        res = self._predict(
-
-
-
-
-
+        res = self._predict(
+            model_id=model_id,
+            sys_prompt=sys_prompt,
+            user_prompt=user_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            mode=mode)

         return res

-    def _predict(
-
-
-
-
-
-
+    def _predict(
+        self,
+        model_id,
+        sys_prompt,
+        user_prompt,
+        temperature,
+        max_tokens,
+        mode: str = 'chat.completion',
+    ) -> dict:

         res = {}
         openai.api_key = self.api_key
@@ -82,9 +84,8 @@ class OpenAIModel(ChatBaseModel):
                     ans_text = resp['choices'][0]['message']['content']
                     model_id = resp['model']
                 else:
-                    logger.warning(
-
-                        f'for input {sys_prompt} {user_prompt}')
+                    logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                   f'for input {sys_prompt} {user_prompt}')
                     ans_text = ''
                     model_id = ''

@@ -98,6 +99,5 @@ class OpenAIModel(ChatBaseModel):
             except Exception as e:
                 logger.warning(f'OpenAI API call failed: {e}')
                 time.sleep(3)
-        logger.error(
-            f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
         return res
evalscope/perf/arguments.py CHANGED

@@ -1,18 +1,22 @@
 import argparse
+import json
+import os
 import sys
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-import
+from evalscope.constants import DEFAULT_WORK_DIR


 @dataclass
 class Arguments:
     # Model and API
-    model: str  # Model
+    model: str  # Model name or path
+    model_id: Optional[str] = None  # Model identifier
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
+    port: str = '8877'  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -32,6 +36,9 @@ class Arguments:
     wandb_api_key: Optional[str] = None  # WandB API key for logging
     name: Optional[str] = None  # Name for the run

+    # Output settings
+    outputs_dir: str = DEFAULT_WORK_DIR
+
     # Prompt settings
     max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
@@ -57,7 +64,6 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
         return Arguments(
             model=args.model,
             attn_implementation=args.attn_implementation,
@@ -72,6 +78,7 @@ class Arguments:
             headers=args.headers,
             wandb_api_key=args.wandb_api_key,
             name=args.name,
+            outputs_dir=args.outputs_dir,
             debug=args.debug,
             tokenizer_path=args.tokenizer_path,
             api=args.api,
@@ -98,6 +105,7 @@ class Arguments:
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+        self.model_id = os.path.basename(self.model)

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
@@ -152,6 +160,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

+    # Output settings
+    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+
     # Dataset settings
     parser.add_argument('--dataset', type=str, default='openqa', help='Specify the dataset')
     parser.add_argument('--dataset-path', type=str, required=False, help='Path to the dataset file')
@@ -170,6 +181,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+
     # yapf: enable

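For orientation, a minimal sketch of how the new Arguments fields fit together, assuming the model_id assignment shown above runs during dataclass post-initialization (the enclosing method is not visible in this hunk) and using a placeholder model path:

from evalscope.perf.arguments import Arguments

# '/models/Qwen2-7B-Instruct' is a placeholder path, not a value from this release.
args = Arguments(model='/models/Qwen2-7B-Instruct', outputs_dir='outputs')
# model_id is expected to be derived via os.path.basename(args.model),
# so wandb runs are now named '<model_id>_<timestamp>' rather than after the full path.
print(args.model_id)  # -> 'Qwen2-7B-Instruct' (expected)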
evalscope/perf/benchmark.py CHANGED

@@ -1,16 +1,15 @@
 import asyncio
 import copy
+import json
+import numpy as np
 import os
 import platform
 import sqlite3
 import threading
 import time
 from http import HTTPStatus
-from typing import List
-
-import json
-import numpy as np
 from tqdm import tqdm
+from typing import List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -138,17 +137,17 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     api_plugin_class = ApiRegistry(args.api)
     api_plugin = api_plugin_class(args.tokenizer_path)

-    result_db_path = get_result_db_path(args
+    result_db_path = get_result_db_path(args)
     # Initialize wandb
     if args.wandb_api_key:
-        import wandb
         import datetime
+        import wandb
         os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] =
+        os.environ['WANDB_DIR'] = args.outputs_dir

         wandb.login(key=args.wandb_api_key)
         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-        name = args.name if args.name else f'{args.
+        name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

     with sqlite3.connect(result_db_path) as con:
@@ -199,7 +198,6 @@ async def start_server(args: Arguments) -> bool:
         args.url = 'http://127.0.0.1:8877/v1/completions'
     else:
         args.url = 'http://127.0.0.1:8877/v1/chat/completions'
-    args.model = os.path.basename(args.model)

     if not await test_connection(args):
        raise TimeoutError('Test connection failed')
evalscope/perf/http_client.py CHANGED

@@ -1,12 +1,10 @@
+import aiohttp
 import asyncio
-import
+import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Tuple

-import aiohttp
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
@@ -21,7 +19,6 @@ class AioHttpClient:
         args: Arguments,
     ):
         self.url = args.url
-        self.debug = args.debug
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
@@ -31,9 +28,7 @@ class AioHttpClient:
                 connect=self.connect_timeout,
                 sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
-            trace_configs=[self._create_trace_config()] if
-        if self.debug:
-            get_logger(log_level=logging.DEBUG)
+            trace_configs=[self._create_trace_config()] if args.debug else [])

     def _create_trace_config(self):
         trace_config = aiohttp.TraceConfig()
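The client now enables request tracing only when debug mode is on, instead of keeping a self.debug flag and reconfiguring the logger inline. The callbacks that _create_trace_config registers are not shown in this hunk; the following is a generic aiohttp tracing sketch with placeholder callbacks, not the package's exact code:

import aiohttp

async def on_request_start(session, context, params):
    print(f'> {params.method} {params.url}')

async def on_request_end(session, context, params):
    print(f'< HTTP {params.response.status}')

def create_trace_config() -> aiohttp.TraceConfig:
    trace_config = aiohttp.TraceConfig()
    trace_config.on_request_start.append(on_request_start)
    trace_config.on_request_end.append(on_request_end)
    return trace_config

# Passed to aiohttp.ClientSession(trace_configs=[create_trace_config()]) when debugging.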
evalscope/perf/main.py CHANGED

@@ -1,9 +1,12 @@
 import asyncio
+import logging
+import os
 import platform
 from argparse import Namespace

 from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
+from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import seed_everything
@@ -18,6 +21,13 @@ def run_perf_benchmark(args):
     args = Arguments.from_args(args)
     seed_everything(args.seed)

+    # Setup logger and output
+    args.outputs_dir = get_output_path(args)
+    get_logger(log_file=os.path.join(args.outputs_dir, 'benchmark.log'), force=True)
+
+    if args.debug:
+        get_logger(log_level=logging.DEBUG, force=True)
+
     logger.info('Starting benchmark...')
     logger.info(args)

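Taken together, run_perf_benchmark now routes its logging into the run's output directory. A small sketch of the added setup, with outputs_dir hard-coded as a placeholder where the real code calls get_output_path(args) (defined in evalscope/perf/utils/db_util.py, not shown in this diff):

import logging
import os
from evalscope.utils.logger import get_logger

outputs_dir = 'outputs'  # placeholder; the real value comes from get_output_path(args)
get_logger(log_file=os.path.join(outputs_dir, 'benchmark.log'), force=True)

debug = True  # mirrors args.debug
if debug:
    get_logger(log_level=logging.DEBUG, force=True)  # re-create the logger at DEBUG level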