PyPI - evalscope - Versions diffs - 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend

evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (234) hide show

evalscope/__init__.py +1 -1
evalscope/arguments.py +73 -0
evalscope/backend/base.py +6 -2
evalscope/backend/opencompass/api_meta_template.py +8 -14
evalscope/backend/opencompass/backend_manager.py +24 -15
evalscope/backend/opencompass/tasks/eval_api.py +1 -6
evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
evalscope/backend/rag_eval/__init__.py +3 -3
evalscope/backend/rag_eval/backend_manager.py +21 -25
evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
evalscope/backend/rag_eval/cmteb/base.py +22 -23
evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
evalscope/backend/rag_eval/ragas/__init__.py +2 -2
evalscope/backend/rag_eval/ragas/arguments.py +3 -8
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
evalscope/backend/rag_eval/ragas/task_template.py +10 -15
evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
evalscope/backend/rag_eval/utils/clip.py +47 -51
evalscope/backend/rag_eval/utils/embedding.py +13 -12
evalscope/backend/rag_eval/utils/llm.py +8 -6
evalscope/backend/rag_eval/utils/tools.py +12 -11
evalscope/backend/vlm_eval_kit/__init__.py +1 -1
evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
evalscope/benchmarks/arc/__init__.py +3 -2
evalscope/benchmarks/arc/ai2_arc.py +19 -16
evalscope/benchmarks/arc/arc_adapter.py +32 -24
evalscope/benchmarks/bbh/__init__.py +1 -2
evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
evalscope/benchmarks/benchmark.py +16 -16
evalscope/benchmarks/ceval/__init__.py +3 -2
evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
evalscope/benchmarks/ceval/ceval_exam.py +18 -31
evalscope/benchmarks/cmmlu/__init__.py +3 -2
evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
evalscope/benchmarks/competition_math/__init__.py +3 -2
evalscope/benchmarks/competition_math/competition_math.py +7 -16
evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
evalscope/benchmarks/data_adapter.py +24 -24
evalscope/benchmarks/general_qa/__init__.py +3 -2
evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
evalscope/benchmarks/gsm8k/__init__.py +1 -1
evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
evalscope/benchmarks/hellaswag/__init__.py +3 -2
evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
evalscope/benchmarks/humaneval/__init__.py +1 -1
evalscope/benchmarks/humaneval/humaneval.py +15 -18
evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
evalscope/benchmarks/mmlu/__init__.py +3 -2
evalscope/benchmarks/mmlu/mmlu.py +15 -29
evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
evalscope/benchmarks/race/__init__.py +3 -2
evalscope/benchmarks/race/race.py +21 -35
evalscope/benchmarks/race/race_adapter.py +33 -29
evalscope/benchmarks/race/samples.jsonl +1 -1
evalscope/benchmarks/trivia_qa/__init__.py +3 -2
evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
evalscope/benchmarks/truthful_qa/__init__.py +3 -2
evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
evalscope/cli/cli.py +6 -5
evalscope/cli/start_eval.py +31 -0
evalscope/cli/start_perf.py +0 -3
evalscope/cli/start_server.py +27 -41
evalscope/config.py +154 -96
evalscope/constants.py +50 -32
evalscope/evaluator/evaluator.py +97 -377
evalscope/evaluator/rating_eval.py +12 -33
evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
evalscope/metrics/code_metric.py +3 -9
evalscope/metrics/math_accuracy.py +3 -6
evalscope/metrics/metrics.py +21 -21
evalscope/metrics/rouge_metric.py +11 -25
evalscope/models/__init__.py +1 -2
evalscope/models/api/openai_api.py +40 -29
evalscope/models/custom/__init__.py +0 -1
evalscope/models/custom/custom_model.py +3 -3
evalscope/models/dummy_chat_model.py +7 -8
evalscope/models/model_adapter.py +89 -156
evalscope/models/openai_model.py +20 -20
evalscope/perf/arguments.py +16 -3
evalscope/perf/benchmark.py +9 -11
evalscope/perf/http_client.py +3 -8
evalscope/perf/main.py +8 -1
evalscope/perf/plugin/api/custom_api.py +1 -2
evalscope/perf/plugin/api/dashscope_api.py +1 -2
evalscope/perf/plugin/api/openai_api.py +3 -4
evalscope/perf/plugin/datasets/base.py +1 -2
evalscope/perf/plugin/datasets/flickr8k.py +1 -2
evalscope/perf/plugin/datasets/longalpaca.py +1 -2
evalscope/perf/plugin/datasets/openqa.py +1 -2
evalscope/perf/plugin/registry.py +3 -3
evalscope/perf/utils/analysis_result.py +1 -2
evalscope/perf/utils/benchmark_util.py +5 -6
evalscope/perf/utils/db_util.py +77 -30
evalscope/perf/utils/local_server.py +21 -13
evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
evalscope/registry/tasks/arc.yaml +2 -3
evalscope/registry/tasks/bbh.yaml +3 -4
evalscope/registry/tasks/bbh_mini.yaml +3 -4
evalscope/registry/tasks/ceval.yaml +3 -3
evalscope/registry/tasks/ceval_mini.yaml +3 -4
evalscope/registry/tasks/cmmlu.yaml +3 -3
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
evalscope/registry/tasks/general_qa.yaml +1 -1
evalscope/registry/tasks/gsm8k.yaml +2 -2
evalscope/registry/tasks/mmlu.yaml +3 -3
evalscope/registry/tasks/mmlu_mini.yaml +3 -3
evalscope/run.py +153 -381
evalscope/run_arena.py +21 -25
evalscope/summarizer.py +27 -40
evalscope/third_party/longbench_write/README.md +99 -42
evalscope/third_party/longbench_write/default_task.json +1 -1
evalscope/third_party/longbench_write/default_task.yaml +8 -7
evalscope/third_party/longbench_write/eval.py +29 -27
evalscope/third_party/longbench_write/infer.py +16 -104
evalscope/third_party/longbench_write/longbench_write.py +5 -4
evalscope/third_party/longbench_write/resources/judge.txt +1 -1
evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
evalscope/third_party/longbench_write/utils.py +0 -1
evalscope/third_party/toolbench_static/eval.py +14 -15
evalscope/third_party/toolbench_static/infer.py +48 -69
evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
evalscope/third_party/toolbench_static/requirements.txt +1 -1
evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
evalscope/tools/combine_reports.py +27 -34
evalscope/tools/rewrite_eval_results.py +15 -47
evalscope/utils/__init__.py +1 -1
evalscope/utils/arena_utils.py +18 -48
evalscope/{perf/utils → utils}/chat_service.py +4 -5
evalscope/utils/completion_parsers.py +3 -8
evalscope/utils/io_utils.py +162 -0
evalscope/utils/logger.py +17 -7
evalscope/utils/model_utils.py +11 -0
evalscope/utils/utils.py +5 -306
evalscope/version.py +2 -2
{evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
evalscope-0.8.1.dist-info/RECORD +285 -0
tests/cli/test_run.py +53 -15
tests/perf/test_perf.py +6 -1
tests/rag/test_clip_benchmark.py +38 -38
tests/rag/test_mteb.py +3 -2
tests/rag/test_ragas.py +5 -5
tests/swift/test_run_swift_eval.py +2 -3
tests/swift/test_run_swift_vlm_eval.py +2 -3
tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
tests/vlm/test_vlmeval.py +3 -2
evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
evalscope/cache.py +0 -98
evalscope/models/template.py +0 -1446
evalscope/run_ms.py +0 -140
evalscope/utils/task_cfg_parser.py +0 -10
evalscope/utils/task_utils.py +0 -22
evalscope-0.7.2.dist-info/RECORD +0 -286
{evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0

evalscope/cli/start_server.py CHANGED Viewed

@@ -1,67 +1,56 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os, sys, time
-from argparse import ArgumentParser
+import os
 import subprocess
+import sys
+import time
+from argparse import ArgumentParser
 from evalscope.cli.base import CLICommand
 current_path = os.path.dirname(os.path.abspath(__file__))
 print(current_path)
 root_path = os.path.dirname(current_path)
 print(root_path)
 def subparser_func(args):
     """ Function which will be called for a specific sub parser.
     """
     return PerfServerCMD(args)
 def add_perf_args(parser):
+    parser.add_argument('--server-command', required=True, type=str, help='The start server command.')
     parser.add_argument(
-        '--server-command', required=True, type=str, help='The start server command.')
-    parser.add_argument(
-        '--logdir', required=True, type=str, help='The monitor log save dir, tensorboard start at this path for display!')
-    parser.add_argument(
-        '--host', type=str, default='0.0.0.0', help='The tensorboard host'
-    )
-    parser.add_argument(
-        '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
-    )
+        '--logdir',
+        required=True,
+        type=str,
+        help='The monitor log save dir, tensorboard start at this path for display!')
+    parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
+    parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        bufsize=1,
-        universal_newlines=True,
-        encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
     return sub_process
 def start_monitor(args):
-    cmd = ['python',
-           '%s/perf/monitor.py'%root_path,
-           '--logdir',
-           args.logdir]
+    cmd = ['python', '%s/perf/monitor.py' % root_path, '--logdir', args.logdir]
     print(cmd)
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 def start_tensorboard(args):
-    cmd = ['tensorboard',
-           '--logdir',
-           args.logdir,
-           '--host',
-           args.host,
-           '--port',
-           args.tensorboard_port
-           ]
+    cmd = ['tensorboard', '--logdir', args.logdir, '--host', args.host, '--port', args.tensorboard_port]
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 def start_server(args):
     cmd = args.server_command
     print(cmd)
@@ -76,7 +65,7 @@ def start_server(args):
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
 def wait_for_workers(workers):
     while True:
@@ -91,12 +80,12 @@ def wait_for_workers(workers):
                     else:
                         break
             else:
-                print('Worker %s completed!'%idx)
+                print('Worker %s completed!' % idx)
                 for line in iter(worker.stdout.readline, ''):
                     if line != '':
                         sys.stdout.write(line)
                     else:
-                        break
+                        break
                 workers[idx] = None
         is_all_completed = True
@@ -108,7 +97,8 @@ def wait_for_workers(workers):
         if is_all_completed:
             break
         time.sleep(0.1)
 class PerfServerCMD(CLICommand):
     name = 'server'
@@ -127,12 +117,8 @@ class PerfServerCMD(CLICommand):
         # start monitor
         p_monitor = start_monitor(self.args)
         # start tensorboard
-        p_tensorboard = start_tensorboard(self.args)
+        p_tensorboard = start_tensorboard(self.args)
         # start server
         p_server = start_server(self.args)
         wait_for_workers([p_monitor, p_tensorboard, p_server])

evalscope/config.py CHANGED Viewed

@@ -1,69 +1,137 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 import copy
-from dataclasses import dataclass, asdict, field
-from typing import Optional, List
+import json
+import os
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union
-from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import yaml_to_dict
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 logger = get_logger()
 cur_path = os.path.dirname(os.path.abspath(__file__))
-registry_tasks = {
-    'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-    'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-    'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-    'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
-    'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-    'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-    'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
-    # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-    # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-    # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
+DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+DEFAULT_GENERATION_CONFIG = {
+    'max_length': 2048,
+    'max_new_tokens': 512,
+    'do_sample': False,
+    'top_k': 50,
+    'top_p': 1.0,
+    'temperature': 1.0,
 }
 @dataclass
 class TaskConfig:
-    model_args: Optional[dict] = field(default_factory=dict)
-    template_type: Optional[str] = 'default-generation'
-    generation_config: Optional[dict] = field(default_factory=dict)
-    dataset_args: Optional[dict] = field(default_factory=dict)
+    # Model-related arguments
+    model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
+    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+    # Template-related arguments
+    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    chat_template: Optional[str] = None
+    # Dataset-related arguments
+    datasets: Optional[List[str]] = None
+    dataset_args: Optional[Dict] = field(default_factory=dict)
+    dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    dataset_hub: str = HubType.MODELSCOPE
+    # Generation configuration arguments
+    generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+    # Evaluation-related arguments
+    eval_type: str = EvalType.CHECKPOINT
+    eval_backend: str = EvalBackend.NATIVE
+    eval_config: Union[str, Dict, None] = None
+    stage: str = EvalStage.ALL
+    limit: Optional[int] = None
+    # Cache and working directory arguments
+    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
+    use_cache: Optional[str] = None
+    work_dir: str = DEFAULT_WORK_DIR
+    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    # Debug and runtime mode arguments
+    debug: bool = False
     dry_run: bool = False
-    model: CustomModel = None
-    eval_type: str = 'custom'
-    datasets: list = field(default_factory=list)
-    work_dir: str = DEFAULT_ROOT_CACHE_DIR
-    outputs: str = None
-    mem_cache: bool = False
-    use_cache: bool = True
-    stage: str = 'all'      # `all` or `infer` or `review`
-    dataset_hub: str = 'ModelScope'
-    dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
-    limit: int = None
-    eval_backend: str = 'Native'
-    eval_config: dict = field(default_factory=dict)
-    # def __post_init__(self):
-    #     self.registry_tasks = {
-    #         'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-    #         'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-    #         'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-    #         'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-    #         'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-    #
-    #         'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-    #         'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-    #         'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-    #
-    #     }
+    seed: int = 42
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+    def to_dict(self):
+        # Note: to avoid serialization error for some model instance
+        return self.__dict__
+    def __str__(self):
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+    def update(self, other: Union['TaskConfig', dict]):
+        if isinstance(other, TaskConfig):
+            other = other.to_dict()
+        self.__dict__.update(other)
+    def dump_yaml(self, output_dir: str):
+        """Dump the task configuration to a YAML file."""
+        task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
+        try:
+            logger.info(f'Dump task config to {task_cfg_file}')
+            dict_to_yaml(self.to_dict(), task_cfg_file)
+        except Exception as e:
+            logger.warning(f'Failed to dump overall task config: {e}')
+    @staticmethod
+    def list():
+        return list(registry_tasks.keys())
+    @staticmethod
+    def from_yaml(yaml_file: str):
+        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
+    @staticmethod
+    def from_dict(d: dict):
+        return TaskConfig(**d)
+    @staticmethod
+    def from_json(json_file: str):
+        return TaskConfig.from_dict(json_to_dict(json_file))
+    @staticmethod
+    def from_args(args: Namespace):
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+        del args_dict['func']  # Note: compat CLI arguments
+        return TaskConfig.from_dict(args_dict)
+    @staticmethod
+    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+        res_list = []
+        for task_name in tasks:
+            task = registry_tasks.get(task_name, None)
+            if task is None:
+                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                continue
+            task.model = custom_model
+            task.model_id = type(custom_model).__name__
+            res_list.append(task)
+        return res_list
     @staticmethod
     def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +143,7 @@ class TaskConfig:
             data_pattern: str, the data pattern for the task.
                     e.g. `mmlu`, `ceval`, `gsm8k`, ...
                     refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
                 then your specific custom dataset directory will be /path/to/data/{name}
             subset_list: list, the subset list for the dataset.
                 e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +151,55 @@ class TaskConfig:
         """
         available_datasets = list(registry_tasks.keys())
         if data_pattern not in available_datasets:
-            logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
+            logger.error(
+                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
             return
         # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks.get(data_pattern)
+        pattern_config = registry_tasks[data_pattern]
         custom_config = copy.deepcopy(pattern_config)
-        custom_config.update({'datasets': [data_pattern]})
-        custom_config.update({'dataset_hub': 'Local'})     # TODO: to support `ModelScope`
-        if 'dataset_args' in custom_config:
-            if data_pattern not in custom_config:
-                custom_config['dataset_args'].update({data_pattern: {}})
-        else:
-            custom_config.update({'dataset_args': {data_pattern: {}}})
+        custom_config.datasets = [data_pattern]
+        custom_config.dataset_args = {data_pattern: {}}
+        custom_config.eval_type = EvalType.CHECKPOINT
         if dataset_dir is not None:
-            custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
+            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
         if subset_list is not None:
-            # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
-            custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
         registry_tasks.update({name: custom_config})
         logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-    def to_dict(self):
-        # Note: to avoid serialization error for some model instance
-        _tmp_model = copy.copy(self.model)
-        self.model = None
-        res_dict = asdict(self)
-        res_dict.update({'model': _tmp_model})
-        self.model = _tmp_model
-        return res_dict
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task: dict = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-            res = TaskConfig(**task)
-            res.model = custom_model
-            if res.outputs is None:
-                res.outputs = os.path.join(res.work_dir,
-                                           'outputs',
-                                           f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
-            res_list.append(res)
-        return res_list
-    @staticmethod
-    def list():
-        return list(registry_tasks.keys())
+tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
+registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
 class TempModel(CustomModel):
@@ -158,9 +218,7 @@ if __name__ == '__main__':
     # Register a new task
     TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-    import json
     swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
     for item in swift_eval_task:
-        print(item.to_dict())
+        print(item)
         print()

evalscope/constants.py CHANGED Viewed

@@ -1,7 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from enum import Enum
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
+from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
-DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
+DEFAULT_WORK_DIR = './outputs'
+DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+class HubType:
+    MODELSCOPE = 'modelscope'
+    HUGGINGFACE = 'huggingface'
+    LOCAL = 'local'
 class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
     ]
-class MetricMembers(Enum):
+class MetricMembers:
     # Math accuracy metric
     MATH_ACCURACY = 'math_accuracy'
@@ -65,54 +76,25 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
-class OutputsStructure:
-    LOGS_DIR = 'logs_dir'
-    PREDICTIONS_DIR = 'predictions_dir'
-    REVIEWS_DIR = 'reviews_dir'
-    REPORTS_DIR = 'reports_dir'
-    CONFIGS_DIR = 'configs_dir'
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
     ORIGIN_PROMPT = 'origin_prompt'
     MODEL_SPEC = 'model_spec'
     SUBSET_NAME = 'subset_name'
     CHOICES = 'choices'
 class ReviewKeys:
     REVIEW_ID = 'review_id'
     REVIEWED = 'reviewed'
     REVIEWER_SPEC = 'reviewer_spec'
     REVIEW_TIME = 'review_time'
     MESSAGE = 'message'
     CONTENT = 'content'
     GOLD = 'gold'
     PRED = 'pred'
     RESULT = 'result'
     REVIEW = 'review'
@@ -148,3 +130,39 @@ class EvalStage:
     ALL = 'all'
     INFER = 'infer'
     REVIEW = 'review'
+class EvalType:
+    CUSTOM = 'custom'
+    CHECKPOINT = 'checkpoint'
+class EvalBackend:
+    class _Backend:
+        #  compatible with old version, set 'value'
+        def __init__(self, value):
+            self._value = value
+        @property
+        def value(self):
+            return self._value
+        def __str__(self):
+            return self._value
+        def __repr__(self):
+            return f"'{self._value}'"
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')

evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

Potentially problematic release.

evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl