evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/tools/combine_reports.py CHANGED
@@ -1,13 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import os
- import json
  import glob
+ import json
+ import os
+ from collections import defaultdict
  from tabulate import tabulate
+
  from evalscope.utils.logger import get_logger

  logger = get_logger()
-
  """
  Combine and generate table for reports of LLMs.
  """
@@ -15,8 +16,9 @@ Combine and generate table for reports of LLMs.

  def get_report(report_file: str):
  data_d: dict = json.load(open(report_file, 'r'))
- dataset_name = data_d['name']
- score = data_d['score'] # float or dict
+ dataset_name = data_d['dataset_name']
+ model_name = data_d['model_name']
+ score = data_d['score'] # float or dict
  score_d = {}
  if isinstance(score, dict):
  # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
@@ -29,19 +31,16 @@ def get_report(report_file: str):
  # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
  score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])

- return {'dataset_name': dataset_name, 'score': score_str}
+ return model_name, {'dataset_name': dataset_name, 'score': score_str}


  def get_model_reports(model_report_dir: str):
  model_report_dir = os.path.normpath(model_report_dir)
- model_report_dir = model_report_dir.rstrip('reports')
- model_info = os.path.basename(os.path.normpath(model_report_dir))
- model_name = '_'.join(model_info.split('_')[:-1][3:])
- report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+ report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))

- model_reports_d = {model_name: []}
+ model_reports_d = defaultdict(list)
  for file_path in report_files:
- report_d = get_report(file_path)
+ model_name, report_d = get_report(file_path)
  model_reports_d[model_name].append(report_d)

  return model_reports_d
@@ -55,8 +54,6 @@ def gen_table(reports_path_list: list):
  for report_path in reports_path_list:
  model_reports_d = get_model_reports(report_path)
  for model_name, report_list in model_reports_d.items():
- # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
- # {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
  report_list = sorted(report_list, key=lambda x: x['dataset_name'])
  if not is_headers_set:
  headers.extend([x['dataset_name'] for x in report_list])
@@ -71,37 +68,34 @@ def gen_table(reports_path_list: list):
  report_table = tabulate(table_values, headers=headers, tablefmt='grid')
  return report_table

+
  class ReportsRecorder:
  COMMON_DATASET_PATH = []
  CUSTOM_DATASET_PATH = []

- def __init__(self, oss_url: str = "", endpoint: str = ""):
+ def __init__(self, oss_url: str = '', endpoint: str = ''):
  if oss_url and endpoint:
  import oss2
  from oss2.credentials import EnvironmentVariableCredentialsProvider

  auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
- oss_url = oss_url.replace("oss://", "").split('/')
+ oss_url = oss_url.replace('oss://', '').split('/')
  bucket_name = oss_url[0]

- self.object_path = "/".join(oss_url[1:])
+ self.object_path = '/'.join(oss_url[1:])
  self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
  else:
- self.object_path = ""
+ self.object_path = ''
  self.bucket = None

-
  def append_path(self, report_path: str, dataset_name: str):
- if dataset_name == "general_qa":
+ if dataset_name == 'general_qa':
  self.CUSTOM_DATASET_PATH.append(report_path)
  else:
  self.COMMON_DATASET_PATH.append(report_path)
-
+
  def dump_reports(self, output_dir: str):
- result = {
- "CommonDataset": [],
- "CustomDataset": []
- }
+ result = {'CommonDataset': [], 'CustomDataset': []}
  for line in self.COMMON_DATASET_PATH:
  with open(line, 'r') as f:
  report = json.load(f)
@@ -109,20 +103,21 @@ class ReportsRecorder:
  for line in self.CUSTOM_DATASET_PATH:
  with open(line, 'r') as f:
  report = json.load(f)
- report.update({"name": os.path.basename(line)})
+ report.update({'name': os.path.basename(line)})
  result['CustomDataset'].append(report)
-
+
  os.makedirs(output_dir, exist_ok=True)
- output_file_name = "metric.json"
+ output_file_name = 'metric.json'
  output_path = os.path.join(output_dir, output_file_name)
  with open(output_path, 'w+') as f:
  f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
+
  if self.bucket:
  remote_path = os.path.join(self.object_path, output_file_name)
- logger.info(f"** Upload report to oss: {remote_path}")
+ logger.info(f'** Upload report to oss: {remote_path}')
  self.bucket.put_object_from_file(remote_path, output_path)

+
  if __name__ == '__main__':
  report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
  report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
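
For orientation, a minimal sketch (with hypothetical report data) of the aggregation shape after this change: get_report now returns the model name alongside the per-dataset row, and get_model_reports groups rows into a defaultdict keyed by model instead of parsing the model name out of the directory path.

from collections import defaultdict

# Hypothetical report rows, shaped like the 0.8.0 report JSON ('model_name' / 'dataset_name' / 'score').
reports = [
    {'model_name': 'qwen-7b-chat', 'dataset_name': 'ARC', 'score': {'acc': 0.81}},
    {'model_name': 'qwen-7b-chat', 'dataset_name': 'GSM8K', 'score': {'acc': 0.52}},
]

model_reports_d = defaultdict(list)
for data_d in reports:
    model_name = data_d['model_name']  # get_report() now returns this alongside the row
    model_reports_d[model_name].append({'dataset_name': data_d['dataset_name'], 'score': data_d['score']})

print(dict(model_reports_d))  # one entry per model, each holding its per-dataset scores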
evalscope/tools/rewrite_eval_results.py CHANGED
@@ -4,12 +4,10 @@ import time

  from evalscope.models.custom import CustomModel
  from evalscope.run import run_task
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
  from evalscope.utils import yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()
-
  """
  This script is used to rewrite the evaluation results without re-running the model predictions.
  """
@@ -26,19 +24,20 @@ class DummyCustomModel(CustomModel):
  response = 'The answer is C. NOTE: ONLY FOR TEST'

  res_d: dict = {
- 'choices': [
- {
- 'index': 0,
- 'message': {
- # 'content': f'The answer is B. Raw prompt: {prompt}',
- 'content': response,
- 'role': 'assistant'
- }
+ 'choices': [{
+ 'index': 0,
+ 'message': {
+ # 'content': f'The answer is B. Raw prompt: {prompt}',
+ 'content': response,
+ 'role': 'assistant'
  }
- ],
- 'created': time.time(),
- 'model': self.config.get('model_id'), # should be model_id
- 'object': 'chat.completion',
+ }],
+ 'created':
+ time.time(),
+ 'model':
+ self.config.get('model_id'), # should be model_id
+ 'object':
+ 'chat.completion',
  'usage': {
  'completion_tokens': 0,
  'prompt_tokens': 0,
@@ -49,36 +48,6 @@ class DummyCustomModel(CustomModel):
  return [res_d for _ in prompts]


- def get_task_cfg(cfg_file: str, model_instance: CustomModel):
- if cfg_file:
- cfg_file: str = os.path.abspath(cfg_file)
- logger.info(f'Loading task config from {cfg_file}')
- task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
- task_cfg_d.update({'model': model_instance})
- logger.info(f'**Task config: {task_cfg_d}')
- else:
- # Default config example
- task_cfg_d = {
- 'model_args': {},
- 'generation_config': {},
- 'dataset_args': {},
- 'dry_run': False,
- 'model': model_instance, # NOTE: model_id or # model_dir or model_instance(CustomModel)
- 'eval_type': 'custom', # NOTE: `checkpoint` or `custom` or `service`
- 'datasets': ['arc'],
- 'work_dir': DEFAULT_ROOT_CACHE_DIR,
- 'outputs': './outputs/eval_swift_dummy',
- 'mem_cache': False,
- 'dataset_hub': 'ModelScope',
- 'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
- 'stage': 'all',
- 'limit': 10,
- 'debug': False
- }
-
- return task_cfg_d
-
-
  if __name__ == '__main__':
  # step1: if the outputs directory has been moved, update the paths configured in configs/task_output_config.yaml under outputs/eval_xxx
  # step2: run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
@@ -91,5 +60,4 @@ if __name__ == '__main__':
  task_cfg_d.update({'model': swift_model})

  eval_results: dict = run_task(task_cfg=task_cfg_d)
- print(f'** Evaluation results finished !\n')
-
+ print('** Evaluation results finished !\n')
evalscope/utils/__init__.py CHANGED
@@ -1,4 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  from evalscope.utils.utils import *
- from evalscope.utils.task_utils import *
evalscope/utils/arena_utils.py CHANGED
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) lmsys.org.

- import random
- from collections import OrderedDict, defaultdict
- from typing import List, Sequence, Union
-
  import numpy as np
  import pandas as pd
  import pyarrow as pa
+ import random
+ from collections import OrderedDict, defaultdict
+ from typing import List, Sequence, Union

  from evalscope.utils.logger import get_logger

@@ -25,9 +24,7 @@ def compute_elo(battles,
  init_rating=1000):
  rating = defaultdict(lambda: init_rating)

- for rd, model_a, model_b, win in battles[[
- col_model_a, col_model_b, col_win
- ]].itertuples():
+ for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
  ra = rating[model_a]
  rb = rating[model_b]
  ea = 1 / (1 + base**((rb - ra) / scale))
@@ -46,9 +43,7 @@ def compute_elo(battles,
  return rating


- def merge_ques_ans(answer_list_all,
- merge_key: str = 'question_id',
- merge_mode: str = 'inner') -> pd.DataFrame:
+ def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
  """
  Merge question and answer list to unifiled data.

@@ -67,18 +62,11 @@
  """
  ans_df = pd.DataFrame()
  for ans_list in answer_list_all:
- ans_list = [{
- 'question_id': item['question_id'],
- item['model_id']: item
- } for item in ans_list]
+ ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
  if ans_df.empty:
  ans_df = pa.Table.from_pylist(ans_list).to_pandas()
  else:
- ans_df = pd.merge(
- ans_df,
- pa.Table.from_pylist(ans_list).to_pandas(),
- on=merge_key,
- how=merge_mode)
+ ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)

  return ans_df

@@ -112,21 +100,17 @@ def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:

  if baseline_idx != -1:
  n_column = columns[baseline_idx]
- res_list = [(column, n_column) for column in columns
- if column != n_column]
+ res_list = [(column, n_column) for column in columns if column != n_column]
  else:
  mat = np.ones((cols_num, cols_num))
  mat_lower_tril = np.tril(mat, k=-1)
  x_ids, y_ids = np.where(mat_lower_tril == 1)
- res_list = [(columns[x_id], columns[y_id])
- for x_id, y_id in zip(x_ids, y_ids)]
+ res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]

  return res_list


- def get_battle_pairs_origin(columns: List[str],
- compare_base: bool = False,
- swap: bool = False): # TODO: to refactor
+ def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False): # TODO: to refactor
  """
  Get battle pair names from columns.

@@ -152,8 +136,7 @@
  mat = np.ones((cols_num, cols_num))
  mat_lower_tril = np.tril(mat, k=-1)
  x_ids, y_ids = np.where(mat_lower_tril == 1)
- res_list = [(columns[x_id], columns[y_id])
- for x_id, y_id in zip(x_ids, y_ids)]
+ res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
  else:
  for column in columns[1:]:
  res_list.append((columns[0], column))
@@ -163,8 +146,7 @@
  return res_list


- def shuffle_pairwise_preferences(
- df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+ def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
  """Shuffle the outputs of a pairwise preference dataframe.

  Examples
@@ -182,8 +164,7 @@
  df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)

  if 'preference' in df.columns:
- df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
- df['preference'])
+ df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])

  return df

@@ -202,20 +183,14 @@ class BattlePairSelection:
  # Make sure model_elo_map to be ordered when compare_base is true.
  self.model_elo_map = model_elo_map

- def top_k(self,
- k: int = DEFAULT_K,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
  if k <= 0:
  k = self.DEFAULT_K
  sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
  sorted_res = list(dict(sorted_res).keys())
  return get_battle_pairs_origin(sorted_res, compare_base, swap)

- def random_k(self,
- k: int = DEFAULT_K,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
  if k <= 0:
  k = self.DEFAULT_K
  if k > len(self.model_elo_map):
@@ -226,21 +201,16 @@
  res = list(res.keys())
  return get_battle_pairs_origin(res, compare_base, swap)

- def volatility_index(self,
- frac: float = 0.2,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
  res_list = []
- candidate_list = get_battle_pairs_origin(
- list(self.model_elo_map.keys()), compare_base, swap)
+ candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
  for t in candidate_list:
  model_a = t[0]
  model_b = t[1]
  base_elo_a = self.model_elo_map.get(model_a)
  base_elo_b = self.model_elo_map.get(model_b)

- vol_frac = abs(base_elo_b - base_elo_a) / max(
- base_elo_a, base_elo_b)
+ vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
  if vol_frac <= frac:
  res_list.append(t)
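
As a quick aside on the Elo math shown in compute_elo above, the expected score of model A against model B follows the standard logistic form. A minimal sketch, assuming the conventional base=10 and scale=400 defaults (only init_rating=1000 is visible in the hunk):

from collections import defaultdict

def expected_score(ra: float, rb: float, base: float = 10, scale: float = 400) -> float:
    # Mirrors the expression in compute_elo: ea = 1 / (1 + base ** ((rb - ra) / scale))
    return 1 / (1 + base**((rb - ra) / scale))

rating = defaultdict(lambda: 1000)  # init_rating=1000, as in the diff
print(expected_score(rating['model_a'], rating['model_b']))  # 0.5 when both models start at 1000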
evalscope/{perf/utils → utils}/chat_service.py RENAMED
@@ -1,14 +1,13 @@
  import os
  import time
+ import torch
  from contextlib import contextmanager
  from functools import partial
- from threading import Thread
- from typing import List, Literal, Optional, Union
-
- import torch
  from modelscope import AutoModelForCausalLM, AutoTokenizer
  from pydantic import BaseModel, Field
+ from threading import Thread
  from transformers import TextIteratorStreamer
+ from typing import List, Literal, Optional, Union


  class Usage(BaseModel):
evalscope/utils/completion_parsers.py CHANGED
@@ -4,7 +4,6 @@
  import ast
  import re

-
  # from . import utils as ann_utils
  from evalscope.constants import ArenaWinner
  from evalscope.utils.logger import get_logger
@@ -51,9 +50,7 @@ def lmsys_parser(completion, output_format):
  else:
  raise Exception('Invalid score pair.')
  except Exception as e:
- logger.error(
- f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
- )
+ logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
  return ArenaWinner.UNKNOWN, [-1, -1]
  elif output_format == '[[A]]':
  if '[[A]]' in completion:
@@ -63,8 +60,7 @@ def lmsys_parser(completion, output_format):
  elif '[[C]]' in completion:
  winner = ArenaWinner.TIE
  else:
- logger.error(
- f'\nContent: {completion}\nYou must manually fix the score.')
+ logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
  winner = ArenaWinner.UNKNOWN
  return winner

@@ -76,8 +72,7 @@ def ranking_parser(completion, **kwargs):
  else:
  ordered_completions = completion

- rank = [c for c in ordered_completions
- if c['model'] == 'model_a'][0]['rank']
+ rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
  assert rank in [1, 2]

  return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
evalscope/utils/logger.py CHANGED
@@ -1,5 +1,6 @@
  import importlib.util as iutil
  import logging
+ import os
  from typing import Optional

  init_loggers = {}
@@ -9,11 +10,12 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

  detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

- logging.basicConfig(format=simple_format, level=logging.INFO)
+ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)


- def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'):
+ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
  """Get logging logger

  Args:
@@ -29,12 +31,12 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
  logger.propagate = False

  if logger_name in init_loggers:
- if logger.level != log_level:
+ if force:
  logger.setLevel(log_level)
- add_file_handler_if_needed(logger, log_file, file_mode, log_level)
- for handler in logger.handlers:
- handler.setLevel(log_level)
- handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+ for handler in logger.handlers:
+ handler.setLevel(log_level)
+ handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+ add_file_handler_if_needed(logger, log_file, file_mode, log_level)
  return logger

  # handle duplicate logs to the console
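
A minimal usage sketch based on the hunks above (the LOG_LEVEL value and the force call are illustrative): DEFAULT_LEVEL is resolved from the LOG_LEVEL environment variable at import time, and force=True re-applies the level and formatter to a logger that was already initialized.

import logging
import os

os.environ['LOG_LEVEL'] = 'DEBUG'  # must be set before evalscope.utils.logger is first imported

from evalscope.utils.logger import get_logger

logger = get_logger()  # inherits DEFAULT_LEVEL (DEBUG here)
logger.debug('visible because LOG_LEVEL=DEBUG')

# In 0.7.2 a level mismatch alone triggered reconfiguration; now it only happens when force=True.
logger = get_logger(log_level=logging.INFO, force=True)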
evalscope/utils/model_utils.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import GenerationConfig
+
+
+ def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+ # Use the default values of temperature/top_p/top_k in generation_config.
+ if generation_config.temperature == 0:
+ generation_config.do_sample = False
+ if generation_config.do_sample is False:
+ generation_config.temperature = 1.
+ generation_config.top_p = 1.
+ generation_config.top_k = 50
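
A minimal usage sketch of the new helper (the GenerationConfig values are illustrative): a temperature of 0 disables sampling, and once sampling is off the sampling parameters are reset to their defaults so transformers does not warn about them.

from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning

gen_cfg = GenerationConfig(temperature=0.0, do_sample=True)  # hypothetical config
fix_do_sample_warning(gen_cfg)
assert gen_cfg.do_sample is False
assert gen_cfg.temperature == 1.0 and gen_cfg.top_p == 1.0 and gen_cfg.top_k == 50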