evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/cli/start_server.py
CHANGED
```diff
@@ -1,67 +1,56 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
-from argparse import ArgumentParser
+import os
 import subprocess
-
+import sys
+import time
+from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
 
-
 current_path = os.path.dirname(os.path.abspath(__file__))
 print(current_path)
 root_path = os.path.dirname(current_path)
 print(root_path)
 
+
 def subparser_func(args):
     """ Function which will be called for a specific sub parser.
     """
     return PerfServerCMD(args)
 
+
 def add_perf_args(parser):
+    parser.add_argument('--server-command', required=True, type=str, help='The start server command.')
     parser.add_argument(
-        '--
-
-
-
-
-    )
-
-        '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
-    )
+        '--logdir',
+        required=True,
+        type=str,
+        help='The monitor log save dir, tensorboard start at this path for display!')
+    parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
+    parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
+
 
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        bufsize=1,
-        universal_newlines=True,
-        encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
     return sub_process
 
+
 def start_monitor(args):
-    cmd = ['python',
-           '%s/perf/monitor.py'%root_path,
-           '--logdir',
-           args.logdir]
+    cmd = ['python', '%s/perf/monitor.py' % root_path, '--logdir', args.logdir]
     print(cmd)
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 
+
 def start_tensorboard(args):
-    cmd = ['tensorboard',
-           '--logdir',
-           args.logdir,
-           '--host',
-           args.host,
-           '--port',
-           args.tensorboard_port
-    ]
+    cmd = ['tensorboard', '--logdir', args.logdir, '--host', args.host, '--port', args.tensorboard_port]
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 
+
 def start_server(args):
     cmd = args.server_command
     print(cmd)
@@ -76,7 +65,7 @@ def start_server(args):
 
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
-
+
 
 def wait_for_workers(workers):
     while True:
@@ -91,12 +80,12 @@ def wait_for_workers(workers):
             else:
                 break
         else:
-            print('Worker %s completed!'%idx)
+            print('Worker %s completed!' % idx)
             for line in iter(worker.stdout.readline, ''):
                 if line != '':
                     sys.stdout.write(line)
                 else:
-                    break
+                    break
             workers[idx] = None
 
         is_all_completed = True
@@ -108,7 +97,8 @@ def wait_for_workers(workers):
         if is_all_completed:
             break
         time.sleep(0.1)
-
+
+
 class PerfServerCMD(CLICommand):
     name = 'server'
 
@@ -127,12 +117,8 @@ class PerfServerCMD(CLICommand):
         # start monitor
         p_monitor = start_monitor(self.args)
         # start tensorboard
-        p_tensorboard = start_tensorboard(self.args)
+        p_tensorboard = start_tensorboard(self.args)
         # start server
         p_server = start_server(self.args)
-
+
         wait_for_workers([p_monitor, p_tensorboard, p_server])
-
-
-
-
```
evalscope/config.py
CHANGED
```diff
@@ -1,69 +1,137 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import copy
-
-
+import json
+import os
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union
 
-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
-
-
-    '
-    '
-    '
-    '
-    '
-    '
-
-    # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-    # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-    # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-
+DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+DEFAULT_GENERATION_CONFIG = {
+    'max_length': 2048,
+    'max_new_tokens': 512,
+    'do_sample': False,
+    'top_k': 50,
+    'top_p': 1.0,
+    'temperature': 1.0,
 }
 
 
 @dataclass
 class TaskConfig:
-
-
-
-
+    # Model-related arguments
+    model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
+    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+
+    # Template-related arguments
+    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    chat_template: Optional[str] = None
+
+    # Dataset-related arguments
+    datasets: Optional[List[str]] = None
+    dataset_args: Optional[Dict] = field(default_factory=dict)
+    dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    dataset_hub: str = HubType.MODELSCOPE
+
+    # Generation configuration arguments
+    generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+
+    # Evaluation-related arguments
+    eval_type: str = EvalType.CHECKPOINT
+    eval_backend: str = EvalBackend.NATIVE
+    eval_config: Union[str, Dict, None] = None
+    stage: str = EvalStage.ALL
+    limit: Optional[int] = None
+
+    # Cache and working directory arguments
+    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
+    use_cache: Optional[str] = None
+    work_dir: str = DEFAULT_WORK_DIR
+    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+
+    # Debug and runtime mode arguments
+    debug: bool = False
     dry_run: bool = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    seed: int = 42
+
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
+    def to_dict(self):
+        # Note: to avoid serialization error for some model instance
+        return self.__dict__
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+    def update(self, other: Union['TaskConfig', dict]):
+        if isinstance(other, TaskConfig):
+            other = other.to_dict()
+        self.__dict__.update(other)
+
+    def dump_yaml(self, output_dir: str):
+        """Dump the task configuration to a YAML file."""
+        task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
+        try:
+            logger.info(f'Dump task config to {task_cfg_file}')
+            dict_to_yaml(self.to_dict(), task_cfg_file)
+        except Exception as e:
+            logger.warning(f'Failed to dump overall task config: {e}')
+
+    @staticmethod
+    def list():
+        return list(registry_tasks.keys())
+
+    @staticmethod
+    def from_yaml(yaml_file: str):
+        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
+
+    @staticmethod
+    def from_dict(d: dict):
+        return TaskConfig(**d)
+
+    @staticmethod
+    def from_json(json_file: str):
+        return TaskConfig.from_dict(json_to_dict(json_file))
+
+    @staticmethod
+    def from_args(args: Namespace):
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+        del args_dict['func']  # Note: compat CLI arguments
+
+        return TaskConfig.from_dict(args_dict)
+
+    @staticmethod
+    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+        res_list = []
+        for task_name in tasks:
+            task = registry_tasks.get(task_name, None)
+            if task is None:
+                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                continue
+
+            task.model = custom_model
+            task.model_id = type(custom_model).__name__
+            res_list.append(task)
+
+        return res_list
 
     @staticmethod
     def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +143,7 @@ class TaskConfig:
             data_pattern: str, the data pattern for the task.
                 e.g. `mmlu`, `ceval`, `gsm8k`, ...
                 refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
                 then your specific custom dataset directory will be /path/to/data/{name}
             subset_list: list, the subset list for the dataset.
                 e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +151,55 @@
         """
         available_datasets = list(registry_tasks.keys())
         if data_pattern not in available_datasets:
-            logger.error(
+            logger.error(
+                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
             return
 
         # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks
+        pattern_config = registry_tasks[data_pattern]
 
         custom_config = copy.deepcopy(pattern_config)
-        custom_config.
-        custom_config.
-
-        if data_pattern not in custom_config:
-            custom_config['dataset_args'].update({data_pattern: {}})
-        else:
-            custom_config.update({'dataset_args': {data_pattern: {}}})
+        custom_config.datasets = [data_pattern]
+        custom_config.dataset_args = {data_pattern: {}}
+        custom_config.eval_type = EvalType.CHECKPOINT
 
         if dataset_dir is not None:
-            custom_config
+            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
 
         if subset_list is not None:
-
-            custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
 
         registry_tasks.update({name: custom_config})
         logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
 
-    def to_dict(self):
-        # Note: to avoid serialization error for some model instance
-        _tmp_model = copy.copy(self.model)
-        self.model = None
-        res_dict = asdict(self)
-        res_dict.update({'model': _tmp_model})
-        self.model = _tmp_model
-
-        return res_dict
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
+
+registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+
+
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
 
 
 class TempModel(CustomModel):
@@ -158,9 +218,7 @@ if __name__ == '__main__':
     # Register a new task
    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
 
-    import json
     swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
     for item in swift_eval_task:
-        print(item
+        print(item)
     print()
-
```
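With TaskConfig now a plain dataclass carrying explicit defaults, plus the `from_dict`/`from_yaml`/`from_json` constructors and the `parse_task_config` dispatcher, a task can be assembled programmatically. A hedged usage sketch follows; the model path, dataset choice, and limit are placeholders for illustration, not values shipped with the package:

```python
from evalscope.config import TaskConfig, parse_task_config

# Fields left unset fall back to the dataclass defaults
# (work_dir='./outputs', eval_backend=EvalBackend.NATIVE, stage='all', ...).
cfg = TaskConfig.from_dict({
    'model': '/path/to/local_model',  # placeholder checkpoint path
    'datasets': ['gsm8k'],
    'limit': 10,
})

# __post_init__ derives model_id from the model path's basename.
assert cfg.model_id == 'local_model'

# parse_task_config normalizes TaskConfig, dict, argparse.Namespace,
# or a '.yaml'/'.yml'/'.json' file path into a TaskConfig instance.
cfg = parse_task_config(cfg)
print(cfg)  # __str__ renders the config as indented JSON
```

Note that the default factories use the dict-union operator (`DEFAULT_MODEL_ARGS | {}`) to hand each instance its own copy of the defaults, which requires Python 3.9 or later.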
evalscope/constants.py
CHANGED
```diff
@@ -1,7 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
+from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
-
+DEFAULT_WORK_DIR = './outputs'
+DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+
+
+class HubType:
+    MODELSCOPE = 'modelscope'
+    HUGGINGFACE = 'huggingface'
+    LOCAL = 'local'
 
 
 class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers
+class MetricMembers:
 
     # Math accuracy metric
     MATH_ACCURACY = 'math_accuracy'
@@ -65,54 +76,25 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
 
 
-class OutputsStructure:
-
-    LOGS_DIR = 'logs_dir'
-
-    PREDICTIONS_DIR = 'predictions_dir'
-
-    REVIEWS_DIR = 'reviews_dir'
-
-    REPORTS_DIR = 'reports_dir'
-
-    CONFIGS_DIR = 'configs_dir'
-
-
 class AnswerKeys:
-
     ANSWER_ID = 'answer_id'
-
     RAW_INPUT = 'raw_input'
-
     ORIGIN_PROMPT = 'origin_prompt'
-
     MODEL_SPEC = 'model_spec'
-
     SUBSET_NAME = 'subset_name'
-
     CHOICES = 'choices'
 
 
 class ReviewKeys:
-
     REVIEW_ID = 'review_id'
-
     REVIEWED = 'reviewed'
-
     REVIEWER_SPEC = 'reviewer_spec'
-
     REVIEW_TIME = 'review_time'
-
     MESSAGE = 'message'
-
     CONTENT = 'content'
-
     GOLD = 'gold'
-
     PRED = 'pred'
-
     RESULT = 'result'
-
     REVIEW = 'review'
 
 
@@ -148,3 +130,39 @@ class EvalStage:
     ALL = 'all'
     INFER = 'infer'
     REVIEW = 'review'
+
+
+class EvalType:
+
+    CUSTOM = 'custom'
+    CHECKPOINT = 'checkpoint'
+
+
+class EvalBackend:
+
+    class _Backend:
+        # compatible with old version, set 'value'
+
+        def __init__(self, value):
+            self._value = value
+
+        @property
+        def value(self):
+            return self._value
+
+        def __str__(self):
+            return self._value
+
+        def __repr__(self):
+            return f"'{self._value}'"
+
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented
+
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')
```