evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
-import os
-from typing import Any, Dict, Iterator, List
-
 import json
+import os
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase

@@ -1,9 +1,8 @@
 import base64
 from io import BytesIO
-from typing import Any, Dict, Iterator, List
-
 from modelscope.msdatasets import MsDataset
 from PIL import Image
+from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase

@@ -1,8 +1,7 @@
+import json
 import subprocess
 from typing import Any, Dict, Iterator, List
 
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset

@@ -1,9 +1,8 @@
 import base64
+import json
 import pickle
 import sqlite3
 
-import json
-
 result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \

evalscope/perf/utils/db_util.py CHANGED
@@ -1,11 +1,10 @@
 import base64
+import json
 import os
 import pickle
 import sqlite3
 import sys
 from datetime import datetime
-
-import json
 from tabulate import tabulate
 
 from evalscope.perf.arguments import Arguments

@@ -88,15 +87,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     cursor.execute(query, common_columns)
 
 
-def 
+def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-
-
+    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    if not os.path.exists(output_path):
+        os.makedirs(output_path, exist_ok=True)
+    logger.info(f'Save the result to: {output_path}')
+    return output_path
+
 
-
-
+def get_result_db_path(args: Arguments):
+    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')
 
-    logger.info(f'Save the 
+    logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
         logger.warning('The db file exists, delete it and start again!.')
         sys.exit(1)

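The hunk above moves the output-path handling into two helpers, get_output_path and get_result_db_path. A minimal standalone sketch of the same directory layout follows; the field names outputs_dir, name, and model_id come from the hunk, while the helper name build_output_path and the example values are purely illustrative and not part of evalscope:

import os
from datetime import datetime
from typing import Optional

def build_output_path(outputs_dir: str, name: Optional[str], model_id: str) -> str:
    # Mirrors get_output_path in the hunk: <outputs_dir>/<timestamp>/<name or model_id>
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(outputs_dir, current_time, name or model_id)
    os.makedirs(output_path, exist_ok=True)
    return output_path

# e.g. ./outputs/20241111_160543/qwen2-7b-chat
print(build_output_path('./outputs', None, 'qwen2-7b-chat'))
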
@@ -1,16 +1,15 @@
 import os
 import subprocess
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-
 import torch
 import uvicorn
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
-from evalscope.
+from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
     torch.cuda.empty_cache()
 
 
-def create_app(
+def create_app(model, attn_implementation=None) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-    chat_service = ChatService(model_path=
+    chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)
 
     app.add_middleware(
         CORSMiddleware,

@@ -98,18 +97,25 @@ def create_app(args) -> FastAPI:
 
 def start_app(args: Arguments):
     if args.api == 'local':
-        app = create_app(args)
-        uvicorn.run(app, host='0.0.0.0', port=
+        app = create_app(args.model, args.attn_implementation)
+        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 
     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+        # yapf: disable
         proc = subprocess.Popen([
-            'python', '-m', 'vllm.entrypoints.openai.api_server',
-
-
-            '
+            'python', '-m', 'vllm.entrypoints.openai.api_server',
+            '--model', args.model,
+            '--served-model-name', args.model,
+            '--tensor-parallel-size', str(torch.cuda.device_count()),
+            '--max-model-len', '32768',
+            '--gpu-memory-utilization', '0.9',
+            '--host', '0.0.0.0',
+            '--port', args.port,
+            '--disable-log-requests',
+            '--disable-log-stats',
         ])
+        # yapf: enable
         import atexit
 
         def on_exit():

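With the changes above, start_app either serves the model in-process via uvicorn (api='local') or launches vLLM's OpenAI-compatible server as a subprocess (api='local_vllm'). Either way, the perf client talks to an OpenAI-style HTTP endpoint; a hedged sketch of such a request follows, where the host, port, model name, and the /v1/chat/completions route are assumptions based on the vLLM entrypoint rather than values taken from this diff:

import requests  # assumes a server started by start_app is reachable locally

url = 'http://127.0.0.1:8877/v1/chat/completions'  # placeholder host/port
payload = {
    'model': 'qwen2-7b-chat',  # placeholder model name
    'messages': [{'role': 'user', 'content': 'Hello!'}],
    'max_tokens': 64,
}
resp = requests.post(url, json=payload, timeout=60)
print(resp.json()['choices'][0]['message']['content'])
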
@@ -21,7 +21,7 @@ answers_gen:
   model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
   revision: NULL # revision of model, default is NULL
   precision: torch.float16
-  enable: true # enable or disable this model
+  enable: true # enable or disable this model
   template_type: default-generation
   generation_config:
     do_sample: true

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - arc
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - ceval
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -24,5 +24,5 @@ datasets:
 outputs: ./outputs/eval_qwen-7b-chat_v100 # Directory to save the outputs, structure: logs, predictions, reviews, reports
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: 10

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
 precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: true
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

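Across the registry task YAMLs above, the previously empty revision and dataset_hub fields are now populated explicitly (revision: master, dataset_hub: modelscope). A sketch of an equivalent task dictionary follows, assuming the run_task entry point in evalscope.run; the field names mirror the YAML above, while the model id (taken from the commented model_name_or_path example) and the limit value are illustrative:

# A sketch assuming the run_task entry point from evalscope.run (see evalscope/run.py in this diff).
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/qwen-7b-chat',  # placeholder model id, from the YAML comment above
    'model_args': {'revision': 'master', 'precision': 'torch.float16', 'device_map': 'auto'},
    'datasets': ['arc'],
    'dataset_hub': 'modelscope',   # `Local` or `ModelScope`
    'use_cache': False,
    'stage': 'all',
    'limit': 10,
}

run_task(task_cfg=task_cfg)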