evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
+ import json
  import os
  from typing import Any, Dict, Iterator, List

- import json
-
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.base import ApiPluginBase
  from evalscope.perf.plugin.registry import register_api
@@ -1,8 +1,7 @@
- import os
- from typing import Any, Dict, Iterator, List
-
  import json
+ import os
  from transformers import AutoTokenizer
+ from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -151,6 +150,6 @@ class OpenaiPlugin(ApiPluginBase):
          elif input_tokens is None and output_tokens is None:  # no usage info get.
              input_tokens = 0
              output_tokens = 0
-             logger.warning('No usage info get.')
+             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')

      return input_tokens, output_tokens
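The updated warning points the user at the `--tokenizer-path` option. As a rough illustration only (not part of this diff), deriving token counts locally with a Hugging Face tokenizer might look like the sketch below; the helper name and values are hypothetical.

from transformers import AutoTokenizer

# Hypothetical fallback: when the API response carries no usage info,
# count tokens with a local tokenizer checkpoint (the path passed via --tokenizer-path).
def count_tokens(tokenizer_path: str, prompt: str, completion: str):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    input_tokens = len(tokenizer.encode(prompt))
    output_tokens = len(tokenizer.encode(completion))
    return input_tokens, output_tokens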
@@ -1,9 +1,8 @@
+ import json
  import sys
  from abc import abstractmethod
  from typing import Any, Dict, Iterator, List, Tuple

- import json
-
  from evalscope.perf.arguments import Arguments

@@ -1,9 +1,8 @@
  import base64
  from io import BytesIO
- from typing import Any, Dict, Iterator, List
-
  from modelscope.msdatasets import MsDataset
  from PIL import Image
+ from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
@@ -1,6 +1,5 @@
- from typing import Any, Dict, Iterator, List
-
  from modelscope import MsDataset
+ from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
@@ -1,8 +1,7 @@
+ import json
  import subprocess
  from typing import Any, Dict, Iterator, List

- import json
-
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.datasets.base import DatasetPluginBase
  from evalscope.perf.plugin.registry import register_dataset
@@ -1,9 +1,8 @@
  import base64
+ import json
  import pickle
  import sqlite3

- import json
-
  result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
  con = sqlite3.connect(result_db_path)
  query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
@@ -1,9 +1,8 @@
  import time
+ import torch
  from dataclasses import dataclass, field
  from typing import Any, List, Optional, Tuple

- import torch
-
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -1,11 +1,10 @@
  import base64
+ import json
  import os
  import pickle
  import sqlite3
  import sys
  from datetime import datetime
-
- import json
  from tabulate import tabulate

  from evalscope.perf.arguments import Arguments
@@ -88,15 +87,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
      cursor.execute(query, common_columns)


- def get_result_db_path(name, model):
+ def get_output_path(args: Arguments) -> str:
      current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-     output_dir = './outputs'
-     result_db_path = os.path.join(output_dir, f'{name or model}_perf', current_time, 'benchmark_data.db')
+     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+     if not os.path.exists(output_path):
+         os.makedirs(output_path, exist_ok=True)
+     logger.info(f'Save the result to: {output_path}')
+     return output_path
+

-     if not os.path.exists(os.path.dirname(result_db_path)):
-         os.makedirs(os.path.dirname(result_db_path), exist_ok=True)
+ def get_result_db_path(args: Arguments):
+     result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

-     logger.info(f'Save the result to: {result_db_path}')
+     logger.info(f'Save the data base to: {result_db_path}')
      if os.path.exists(result_db_path):
          logger.warning('The db file exists, delete it and start again!.')
          sys.exit(1)
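For orientation only (not part of this diff): the reworked helpers build the output directory from the parsed arguments instead of a hard-coded './outputs' tree. A minimal sketch of the path composition, with hypothetical argument values:

import os
from datetime import datetime

outputs_dir = './outputs'                    # hypothetical args.outputs_dir
name, model_id = None, 'qwen2-7b-instruct'   # hypothetical args.name / args.model_id

current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = os.path.join(outputs_dir, current_time, f'{name or model_id}')
# e.g. ./outputs/20241111_160543/qwen2-7b-instruct
result_db_path = os.path.join(outputs_dir, 'benchmark_data.db')
# get_result_db_path() joins a fixed file name onto the configured outputs_dir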
@@ -1,16 +1,15 @@
  import os
  import subprocess
- from contextlib import asynccontextmanager
- from dataclasses import dataclass
-
  import torch
  import uvicorn
+ from contextlib import asynccontextmanager
+ from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
- from evalscope.perf.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
      torch.cuda.empty_cache()


- def create_app(args) -> FastAPI:
+ def create_app(model, attn_implementation=None) -> FastAPI:
      app = FastAPI(lifespan=lifespan)
-     chat_service = ChatService(model_path=args.model, attn_implementation=args.attn_implementation)
+     chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)

      app.add_middleware(
          CORSMiddleware,
@@ -98,18 +97,25 @@ def create_app(args) -> FastAPI:

  def start_app(args: Arguments):
      if args.api == 'local':
-         app = create_app(args)
-         uvicorn.run(app, host='0.0.0.0', port=8877, workers=1)
+         app = create_app(args.model, args.attn_implementation)
+         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

      elif args.api == 'local_vllm':
          os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+         # yapf: disable
          proc = subprocess.Popen([
-             'python', '-m', 'vllm.entrypoints.openai.api_server', '--model', args.model, '--served-model-name',
-             os.path.basename(args.model), '--tensor-parallel-size',
-             str(torch.cuda.device_count()), '--max-model-len', '32768', '--gpu-memory-utilization', '0.9', '--host',
-             '0.0.0.0', '--port', '8877', '--disable-log-requests', '--disable-log-stats'
+             'python', '-m', 'vllm.entrypoints.openai.api_server',
+             '--model', args.model,
+             '--served-model-name', args.model,
+             '--tensor-parallel-size', str(torch.cuda.device_count()),
+             '--max-model-len', '32768',
+             '--gpu-memory-utilization', '0.9',
+             '--host', '0.0.0.0',
+             '--port', args.port,
+             '--disable-log-requests',
+             '--disable-log-stats',
          ])
+         # yapf: enable
          import atexit

          def on_exit():
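Both the 'local' FastAPI path and the 'local_vllm' path now bind to args.port and serve an OpenAI-compatible endpoint (vLLM's api_server does by design; the FastAPI service is assumed to, given the ChatCompletionRequest import above). A hedged usage sketch against such a server, with hypothetical host, port, and model name:

from openai import OpenAI

# Point any OpenAI-compatible client at the locally started server.
client = OpenAI(base_url='http://127.0.0.1:8877/v1', api_key='EMPTY')
resp = client.chat.completions.create(
    model='qwen2-7b-instruct',  # hypothetical served model name
    messages=[{'role': 'user', 'content': 'hello'}],
)
print(resp.choices[0].message.content)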
@@ -21,7 +21,7 @@ answers_gen:
    model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
    revision: NULL # revision of model, default is NULL
    precision: torch.float16
-   enable: true # enable or disable this model
+   enable: true # enable or disable this model
    template_type: default-generation
    generation_config:
      do_sample: true
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
  eval_type: custom
  datasets:
    - arc
- outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
+ dataset_hub: modelscope # `Local` or `ModelScope`
  limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
  eval_type: custom
  datasets:
    - bbh
- outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
  eval_type: custom
  datasets:
    - bbh
- outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
  eval_type: custom
  datasets:
    - ceval
- outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -24,5 +24,5 @@ datasets:
  outputs: ./outputs/eval_qwen-7b-chat_v100 # Directory to save the outputs, structure: logs, predictions, reviews, reports
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
+ dataset_hub: modelscope # `Local` or `ModelScope`
  limit: 10
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
+ dataset_hub: modelscope # `Local` or `ModelScope`
  limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: true
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
-   revision: default
+   revision: master
    precision: torch.float16
    device_map: auto
    # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
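Across these registry task configs the pattern is consistent: model_args.revision changes from default to master, dataset_hub is lowercased from ModelScope to modelscope, and several files drop the outputs key. A quick sanity check of one updated file could look like the sketch below (not part of the diff; the path and key layout are assumed from the hunks above).

import yaml

with open('evalscope/registry/tasks/arc.yaml') as f:  # hypothetical local checkout path
    cfg = yaml.safe_load(f)

assert cfg['model_args']['revision'] == 'master'  # was: default
assert cfg['dataset_hub'] == 'modelscope'         # was: ModelScope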