evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py
CHANGED
@@ -1,18 +1,22 @@
 import argparse
+import json
+import os
 import sys
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-import
+from evalscope.constants import DEFAULT_WORK_DIR


 @dataclass
 class Arguments:
     # Model and API
-    model: str  # Model
+    model: str  # Model name or path
+    model_id: Optional[str] = None  # Model identifier
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
+    port: int = 8877  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -32,6 +36,9 @@ class Arguments:
     wandb_api_key: Optional[str] = None  # WandB API key for logging
     name: Optional[str] = None  # Name for the run

+    # Output settings
+    outputs_dir: str = DEFAULT_WORK_DIR
+
     # Prompt settings
     max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
@@ -57,7 +64,6 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
         return Arguments(
             model=args.model,
             attn_implementation=args.attn_implementation,
@@ -72,6 +78,7 @@ class Arguments:
             headers=args.headers,
             wandb_api_key=args.wandb_api_key,
             name=args.name,
+            outputs_dir=args.outputs_dir,
             debug=args.debug,
             tokenizer_path=args.tokenizer_path,
             api=args.api,
@@ -98,6 +105,7 @@ class Arguments:
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+        self.model_id = os.path.basename(self.model)

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
@@ -130,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
@@ -152,6 +161,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

+    # Output settings
+    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+
     # Dataset settings
     parser.add_argument('--dataset', type=str, default='openqa', help='Specify the dataset')
     parser.add_argument('--dataset-path', type=str, required=False, help='Path to the dataset file')
@@ -170,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+
     # yapf: enable

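A minimal usage sketch (not part of the diff) of the new perf arguments, assuming evalscope 0.8.1 is installed and that the model_id assignment shown above runs when an Arguments instance is initialized; the model path below is a placeholder:

    from evalscope.perf.arguments import Arguments

    # Only `model` is required; `port` and `outputs_dir` are the new 0.8.1 fields.
    args = Arguments(model='models/Qwen2-7B-Instruct', api='local', port=8877, outputs_dir='outputs')
    # model_id is derived via os.path.basename(self.model) per the hunk above (assumed to run on init).
    print(args.model_id)   # expected: 'Qwen2-7B-Instruct'
    print(args)            # __str__ dumps the settings as JSON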
evalscope/perf/benchmark.py
CHANGED
@@ -1,16 +1,15 @@
 import asyncio
 import copy
+import json
+import numpy as np
 import os
 import platform
 import sqlite3
 import threading
 import time
 from http import HTTPStatus
-from typing import List
-
-import json
-import numpy as np
 from tqdm import tqdm
+from typing import List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -138,17 +137,17 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     api_plugin_class = ApiRegistry(args.api)
     api_plugin = api_plugin_class(args.tokenizer_path)

-    result_db_path = get_result_db_path(args
+    result_db_path = get_result_db_path(args)
     # Initialize wandb
     if args.wandb_api_key:
-        import wandb
         import datetime
+        import wandb
         os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] =
+        os.environ['WANDB_DIR'] = args.outputs_dir

         wandb.login(key=args.wandb_api_key)
         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-        name = args.name if args.name else f'{args.
+        name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

     with sqlite3.connect(result_db_path) as con:
@@ -196,10 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()

     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:
-        args.model = os.path.basename(args.model)
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
evalscope/perf/http_client.py
CHANGED
@@ -1,12 +1,10 @@
+import aiohttp
 import asyncio
-import
+import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Tuple

-import aiohttp
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
@@ -21,7 +19,6 @@ class AioHttpClient:
         args: Arguments,
     ):
         self.url = args.url
-        self.debug = args.debug
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
@@ -31,9 +28,7 @@ class AioHttpClient:
                 connect=self.connect_timeout,
                 sock_read=self.read_timeout),
            connector=aiohttp.TCPConnector(limit=1),
-            trace_configs=[self._create_trace_config()] if
-        if self.debug:
-            get_logger(log_level=logging.DEBUG)
+            trace_configs=[self._create_trace_config()] if args.debug else [])

    def _create_trace_config(self):
        trace_config = aiohttp.TraceConfig()
evalscope/perf/main.py
CHANGED
@@ -1,11 +1,14 @@
 import asyncio
+import logging
+import os
 import platform
 from argparse import Namespace

 from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
+from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

 logger = get_logger()
@@ -18,6 +21,10 @@ def run_perf_benchmark(args):
     args = Arguments.from_args(args)
     seed_everything(args.seed)

+    # Setup logger and output
+    args.outputs_dir = get_output_path(args)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+
     logger.info('Starting benchmark...')
     logger.info(args)

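Given the new output handling above, a hedged sketch of driving the perf benchmark programmatically; the exact argument types accepted by run_perf_benchmark are not shown in this diff, so a Namespace produced by parse_args() is assumed:

    from evalscope.perf.arguments import parse_args
    from evalscope.perf.main import run_perf_benchmark

    # parse_args() is assumed to pick up the new --port and --outputs-dir flags among others.
    args = parse_args()
    # Per the hunks above, outputs land in <outputs_dir>/<timestamp>/<name or model_id>/,
    # with benchmark.log written inside that directory.
    run_perf_benchmark(args)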
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,8 +1,7 @@
-import os
-from typing import Any, Dict, Iterator, List
-
 import json
+import os
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -30,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict]
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -1,9 +1,8 @@
 import base64
 from io import BytesIO
-from typing import Any, Dict, Iterator, List
-
 from modelscope.msdatasets import MsDataset
 from PIL import Image
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
@@ -1,8 +1,7 @@
+import json
 import subprocess
 from typing import Any, Dict, Iterator, List

-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
evalscope/perf/plugin/registry.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
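The annotations above make explicit that the registration decorators take either a single name or a list of names (the old signature already hinted at this with `str | List[str]`; 0.8.1 switches to Union for older Python versions). A hypothetical registration sketch; the plugin class below and its required hooks are not shown in this diff and are only indicative:

    from evalscope.perf.plugin.datasets.base import DatasetPluginBase
    from evalscope.perf.plugin.registry import register_dataset


    @register_dataset(['my_openqa', 'my_openqa_alias'])  # a single string also works
    class MyDatasetPlugin(DatasetPluginBase):
        ...  # dataset-specific hooks omitted; see the bundled dataset plugins for the expected interface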
evalscope/perf/utils/analysis_result.py
CHANGED
@@ -1,9 +1,8 @@
 import base64
+import json
 import pickle
 import sqlite3

-import json
-
 result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -1,9 +1,8 @@
 import time
+import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple

-import torch
-
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -117,19 +116,19 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py
CHANGED
@@ -1,12 +1,12 @@
 import base64
+import json
 import os
 import pickle
 import sqlite3
 import sys
 from datetime import datetime
-
-import json
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -88,15 +88,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     cursor.execute(query, common_columns)


-def
+def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-
-
+    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    if not os.path.exists(output_path):
+        os.makedirs(output_path, exist_ok=True)
+    logger.info(f'Save the result to: {output_path}')
+    return output_path
+

-
-
+def get_result_db_path(args: Arguments):
+    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

-    logger.info(f'Save the
+    logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
         logger.warning('The db file exists, delete it and start again!.')
         sys.exit(1)
@@ -104,44 +108,87 @@ def get_result_db_path(name, model):
     return result_db_path


-def
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.

-
-
-
-
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows)
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}

-    #
-
-
+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)':
+        inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)':
+        [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+         for row in rows]
+    }

-
-
-
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]

-
-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
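A small sketch exercising the new calculate_percentiles helper exactly as added above (assumes evalscope 0.8.1 is importable); note that it sorts the input list in place and uses a simple index lookup, idx = int(len(data) * p / 100), rather than interpolation, with an IndexError guard falling back to NaN:

    from evalscope.perf.utils.db_util import calculate_percentiles

    latencies = [0.8, 1.2, 0.5, 2.0, 1.1, 0.9, 1.5, 0.7, 1.3, 1.0]
    print(calculate_percentiles(latencies, [50, 90, 99]))
    # -> {50: 1.1, 90: 2.0, 99: 2.0}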
evalscope/perf/utils/local_server.py
CHANGED
@@ -1,16 +1,15 @@
 import os
 import subprocess
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-
 import torch
 import uvicorn
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
-from evalscope.
+from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
     torch.cuda.empty_cache()


-def create_app(
+def create_app(model, attn_implementation=None) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-    chat_service = ChatService(model_path=
+    chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)

     app.add_middleware(
         CORSMiddleware,
@@ -98,18 +97,27 @@ def create_app(args) -> FastAPI:

 def start_app(args: Arguments):
     if args.api == 'local':
-        app = create_app(args)
-        uvicorn.run(app, host='0.0.0.0', port=
+        app = create_app(args.model, args.attn_implementation)
+        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        # yapf: disable
         proc = subprocess.Popen([
-            'python', '-m', 'vllm.entrypoints.openai.api_server',
-
-
-            '
+            'python', '-m', 'vllm.entrypoints.openai.api_server',
+            '--model', args.model,
+            '--served-model-name', args.model,
+            '--tensor-parallel-size', str(torch.cuda.device_count()),
+            '--max-model-len', '32768',
+            '--gpu-memory-utilization', '0.9',
+            '--host', '0.0.0.0',
+            '--port', str(args.port),
+            '--trust-remote-code',
+            '--disable-log-requests',
+            '--disable-log-stats',
         ])
+        # yapf: enable
         import atexit

         def on_exit():
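A hedged smoke test against the locally served endpoint started by start_app(args) with api='local'; the URL shape mirrors what benchmark.py now builds from args.port, and the model name below is a placeholder:

    import requests  # assumption: requests is available in the environment

    port = 8877  # default Arguments.port
    resp = requests.post(
        f'http://127.0.0.1:{port}/v1/chat/completions',
        json={'model': 'my-model', 'messages': [{'role': 'user', 'content': 'hello'}]},
        timeout=120,
    )
    print(resp.status_code, resp.json())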
evalscope/registry/config/cfg_arena_zhihu.yaml
CHANGED
@@ -21,7 +21,7 @@ answers_gen:
     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
     revision: NULL # revision of model, default is NULL
     precision: torch.float16
-    enable: true # enable or disable this model
+    enable: true  # enable or disable this model
     template_type: default-generation
     generation_config:
       do_sample: true
evalscope/registry/tasks/arc.yaml
CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
 # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - arc
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null
evalscope/registry/tasks/bbh.yaml
CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
 # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/registry/tasks/bbh_mini.yaml
CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
 # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
 # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null