evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
-import os
-from typing import Any, Dict, Iterator, List
-
 import json
+import os
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase

@@ -1,9 +1,8 @@
 import base64
 from io import BytesIO
-from typing import Any, Dict, Iterator, List
-
 from modelscope.msdatasets import MsDataset
 from PIL import Image
+from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase

@@ -1,8 +1,7 @@
+import json
 import subprocess
 from typing import Any, Dict, Iterator, List
 
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset

@@ -1,9 +1,8 @@
 import base64
+import json
 import pickle
 import sqlite3
 
-import json
-
 result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \

evalscope/perf/utils/db_util.py CHANGED
@@ -1,11 +1,10 @@
 import base64
+import json
 import os
 import pickle
 import sqlite3
 import sys
 from datetime import datetime
-
-import json
 from tabulate import tabulate
 
 from evalscope.perf.arguments import Arguments

@@ -88,15 +87,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     cursor.execute(query, common_columns)
 
 
-def 
+def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-
-
+    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    if not os.path.exists(output_path):
+        os.makedirs(output_path, exist_ok=True)
+    logger.info(f'Save the result to: {output_path}')
+    return output_path
+
 
-
-
+def get_result_db_path(args: Arguments):
+    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')
 
-    logger.info(f'Save the 
+    logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
         logger.warning('The db file exists, delete it and start again!.')
         sys.exit(1)

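The hunk above moves the output-path handling into two helpers, get_output_path and get_result_db_path. A minimal standalone sketch of the same directory layout follows; the field names outputs_dir, name, and model_id come from the hunk, while the helper name build_output_path and the example values are purely illustrative and not part of evalscope:

import os
from datetime import datetime
from typing import Optional

def build_output_path(outputs_dir: str, name: Optional[str], model_id: str) -> str:
    # Mirrors get_output_path in the hunk: <outputs_dir>/<timestamp>/<name or model_id>
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = os.path.join(outputs_dir, current_time, name or model_id)
    os.makedirs(output_path, exist_ok=True)
    return output_path

# e.g. ./outputs/20241111_160543/qwen2-7b-chat
print(build_output_path('./outputs', None, 'qwen2-7b-chat'))
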
@@ -1,16 +1,15 @@
 import os
 import subprocess
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-
 import torch
 import uvicorn
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse
 
 from evalscope.perf.arguments import Arguments
-from evalscope.
+from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
     torch.cuda.empty_cache()
 
 
-def create_app(
+def create_app(model, attn_implementation=None) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-    chat_service = ChatService(model_path=
+    chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)
 
     app.add_middleware(
         CORSMiddleware,

@@ -98,18 +97,25 @@ def create_app(args) -> FastAPI:
 
 def start_app(args: Arguments):
     if args.api == 'local':
-        app = create_app(args)
-        uvicorn.run(app, host='0.0.0.0', port=
+        app = create_app(args.model, args.attn_implementation)
+        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
 
     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+        # yapf: disable
         proc = subprocess.Popen([
-            'python', '-m', 'vllm.entrypoints.openai.api_server',
-
-
-            '
+            'python', '-m', 'vllm.entrypoints.openai.api_server',
+            '--model', args.model,
+            '--served-model-name', args.model,
+            '--tensor-parallel-size', str(torch.cuda.device_count()),
+            '--max-model-len', '32768',
+            '--gpu-memory-utilization', '0.9',
+            '--host', '0.0.0.0',
+            '--port', args.port,
+            '--disable-log-requests',
+            '--disable-log-stats',
         ])
+        # yapf: enable
         import atexit
 
         def on_exit():

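With the changes above, start_app either serves the model in-process via uvicorn (api='local') or launches vLLM's OpenAI-compatible server as a subprocess (api='local_vllm'). Either way, the perf client talks to an OpenAI-style HTTP endpoint; a hedged sketch of such a request follows, where the host, port, model name, and the /v1/chat/completions route are assumptions based on the vLLM entrypoint rather than values taken from this diff:

import requests  # assumes a server started by start_app is reachable locally

url = 'http://127.0.0.1:8877/v1/chat/completions'  # placeholder host/port
payload = {
    'model': 'qwen2-7b-chat',  # placeholder model name
    'messages': [{'role': 'user', 'content': 'Hello!'}],
    'max_tokens': 64,
}
resp = requests.post(url, json=payload, timeout=60)
print(resp.json()['choices'][0]['message']['content'])
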
@@ -21,7 +21,7 @@ answers_gen:
   model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
   revision: NULL # revision of model, default is NULL
   precision: torch.float16
-  enable: true # enable or disable this model
+  enable: true # enable or disable this model
   template_type: default-generation
   generation_config:
     do_sample: true

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - arc
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - ceval
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -24,5 +24,5 @@ datasets:
 outputs: ./outputs/eval_qwen-7b-chat_v100 # Directory to save the outputs, structure: logs, predictions, reviews, reports
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: 10

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
 precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: true
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

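Across the registry task YAMLs above, the previously empty revision and dataset_hub fields are now populated explicitly (revision: master, dataset_hub: modelscope). A sketch of an equivalent task dictionary follows, assuming the run_task entry point in evalscope.run; the field names mirror the YAML above, while the model id (taken from the commented model_name_or_path example) and the limit value are illustrative:

# A sketch assuming the run_task entry point from evalscope.run (see evalscope/run.py in this diff).
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/qwen-7b-chat',  # placeholder model id, from the YAML comment above
    'model_args': {'revision': 'master', 'precision': 'torch.float16', 'device_map': 'auto'},
    'datasets': ['arc'],
    'dataset_hub': 'modelscope',   # `Local` or `ModelScope`
    'use_cache': False,
    'stage': 'all',
    'limit': 10,
}

run_task(task_cfg=task_cfg)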