evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/models/model_adapter.py CHANGED
@@ -1,35 +1,25 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI, Inc. and its affiliates.
  # flake8: noqa
+ import numpy as np
  import os
  import sys
- from typing import List, Any, Union, Dict
- import numpy as np
  import time
+ import torch
  from abc import ABC, abstractmethod
  from copy import deepcopy
-
- import torch
+ from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
  from torch import dtype
+ from typing import Any, Dict, List, Union

- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.constants import DEFAULT_MODEL_CACHE_DIR
  from evalscope.models.custom import CustomModel
- from evalscope.models.template import get_template, StopWordsCriteria
+ from evalscope.utils.chat_service import ChatMessage
  from evalscope.utils.logger import get_logger
- from transformers import StoppingCriteriaList
+ from evalscope.utils.model_utils import fix_do_sample_warning

  logger = get_logger()

- # Notes:
- # - modelscope>=1.9.5
-
-
- def get_model_cache_dir(root_cache_dir: str):
-     model_cache_dir = os.path.join(root_cache_dir, 'models')
-     model_cache_dir = os.path.expanduser(model_cache_dir)
-     os.makedirs(model_cache_dir, exist_ok=True)
-     return model_cache_dir
-

  class BaseModelAdapter(ABC):
      """
@@ -69,7 +59,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
                   torch_dtype: dtype = torch.bfloat16,
                   model_revision: str = None,
                   max_length: int = None,
-                  cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                  cache_dir: str = None,
                   **kwargs):
          """
          Args:
@@ -80,11 +70,11 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
              max_length: The max length of input sequence. Default: None.
              **kwargs: Other args.
          """
-         model_cache_dir = get_model_cache_dir(cache_dir)
+         model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

          self.model_id: str = model_id
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         logger.warning(f'**Device: {self.device}')
+         logger.warning(f'Device: {self.device}')

          torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -93,31 +83,21 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          model_cfg['device_map'] = device_map
          model_cfg['torch_dtype'] = str(torch_dtype)

-         from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
-         # from modelscope import snapshot_download
-
-         # try:
-         #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
-         #     logger.warning('**Use local_files_only to load model **')
-         # except:
-         #     model_dir = snapshot_download(self.model_id,
-         #                                   revision=model_revision,
-         #                                   cache_dir=model_cache_dir, )
-         #     logger.warning('**Load model from ModelScope hub **')
-
-         tokenizer = AutoTokenizer.from_pretrained(self.model_id,  # self.model_id
-                                                   revision=model_revision,
-                                                   trust_remote_code=True,
-                                                   cache_dir=model_cache_dir,)
-
-         model = AutoModelForCausalLM.from_pretrained(self.model_id,  # self.model_id
-                                                      revision=model_revision,
-                                                      device_map=device_map,
-                                                      trust_remote_code=True,
-                                                      torch_dtype=torch_dtype,
-                                                      cache_dir=model_cache_dir,)
-
-         # model.generation_config = GenerationConfig.from_pretrained(model_id, trust_remote_code=True)
+         tokenizer = AutoTokenizer.from_pretrained(
+             self.model_id,  # self.model_id
+             revision=model_revision,
+             trust_remote_code=True,
+             cache_dir=model_cache_dir,
+         )
+
+         model = AutoModelForCausalLM.from_pretrained(
+             self.model_id,  # self.model_id
+             revision=model_revision,
+             device_map=device_map,
+             trust_remote_code=True,
+             torch_dtype=torch_dtype,
+             cache_dir=model_cache_dir,
+         )

          super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

@@ -187,18 +167,16 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
              if softval.dtype in {torch.bfloat16, torch.float16}:
                  softval = softval.to(dtype=torch.float32)
              probs = softval.detach().cpu().numpy()
-             pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D
+             pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D

          res_d = {
-             'choices': [
-                 {
-                     'index': 0,
-                     'message': {
-                         'content': pred,
-                         'role': 'assistant'
-                     }
+             'choices': [{
+                 'index': 0,
+                 'message': {
+                     'content': pred,
+                     'role': 'assistant'
                  }
-             ],
+             }],
              'created': time.time(),
              'model': self.model_id,
              'object': 'chat.completion',
@@ -226,7 +204,7 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
                   device_map: str = 'auto',
                   torch_dtype: dtype = torch.bfloat16,
                   model_revision: str = None,
-                  cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                  cache_dir: str = None,
                   **kwargs):
          """
          Continuation-logits model adapter.
@@ -239,12 +217,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
              **kwargs: Other args.
          """

-         super().__init__(model_id=model_id,
-                          device_map=device_map,
-                          torch_dtype=torch_dtype,
-                          model_revision=model_revision,
-                          cache_dir=cache_dir,
-                          **kwargs)
+         super().__init__(
+             model_id=model_id,
+             device_map=device_map,
+             torch_dtype=torch_dtype,
+             model_revision=model_revision,
+             cache_dir=cache_dir,
+             **kwargs)

      @torch.no_grad()
      def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
@@ -282,15 +261,13 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
          pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)

          res_d = {
-             'choices': [
-                 {
-                     'index': 0,
-                     'message': {
-                         'content': pred_list,
-                         'role': 'assistant'
-                     }
+             'choices': [{
+                 'index': 0,
+                 'message': {
+                     'content': pred_list,
+                     'role': 'assistant'
                  }
-             ],
+             }],
              'created': time.time(),
              'model': self.model_id,
              'object': 'chat.completion',
@@ -347,10 +324,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

      def __init__(self,
                   model_id: str,
-                  model_revision: str,
+                  model_revision: str = 'master',
                   device_map: str = 'auto',
-                  torch_dtype: dtype = torch.float16,
-                  cache_dir: str = DEFAULT_ROOT_CACHE_DIR,
+                  torch_dtype: dtype = 'auto',
+                  cache_dir: str = None,
                   **kwargs):
          """
          Chat completion model adapter. Tasks of chat and generation are supported.
@@ -359,17 +336,18 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
              model_id: The model id on ModelScope, or local model_dir.
              model_revision: The model revision on ModelScope. Default: None.
              device_map: The device map for model inference.
-             torch_dtype: The torch dtype for model inference. Default: torch.float16.
+             torch_dtype: The torch dtype for model inference. Default: 'auto'.
              **kwargs: Other args.
          """

          custom_generation_config = kwargs.pop('generation_config', None)
-         model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)
+         custom_chat_template = kwargs.pop('chat_template', None)
+         model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR

          self.model_id: str = model_id
          self.model_revision: str = model_revision
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         logger.warning(f'**Device: {self.device}')
+         logger.warning(f'Device: {self.device}')

          torch_dtype = torch_dtype if torch_dtype is not None else 'auto'

@@ -378,72 +356,47 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
          model_cfg['device_map'] = device_map
          model_cfg['torch_dtype'] = str(torch_dtype)

-         self.template_type = kwargs.pop('template_type', None)
-         logger.warning(f'**Template type: {self.template_type}')
-
-         from evalscope.models.template import TemplateType
-         if isinstance(self.model_id, str) \
-                 and os.path.isdir(os.path.expanduser(self.model_id)) \
-                 and self.template_type is None:
-             raise ValueError(f'Please specify the --template-type for local model dir.\n'
-                              f'Available template types: {TemplateType.get_template_name_list()}\n'
-                              f'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.')
-
-         from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer
-         # from modelscope import snapshot_download
-
-         # try:
-         #     model_dir = snapshot_download(self.model_id, cache_dir=model_cache_dir, local_files_only=True)
-         #     logger.warning('**Use local_files_only to load model **')
-         # except:
-         #     model_dir = snapshot_download(self.model_id,
-         #                                   revision=model_revision,
-         #                                   cache_dir=model_cache_dir, )
-         #     logger.warning('**Load model from ModelScope hub **')
-
-         tokenizer = AutoTokenizer.from_pretrained(self.model_id,
-                                                   revision=model_revision,
-                                                   trust_remote_code=True,
-                                                   cache_dir=model_cache_dir,)
-
-         model = AutoModelForCausalLM.from_pretrained(self.model_id,
-                                                      revision=model_revision,
-                                                      device_map=device_map,
-                                                      trust_remote_code=True,
-                                                      torch_dtype=torch_dtype,
-                                                      cache_dir=model_cache_dir,)
-
-         self.origin_tokenizer = deepcopy(tokenizer)
-
-         self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+         tokenizer = AutoTokenizer.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             trust_remote_code=True,
+             cache_dir=model_cache_dir,
+         )
+
+         model = AutoModelForCausalLM.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             device_map=device_map,
+             trust_remote_code=True,
+             torch_dtype=torch_dtype,
+             cache_dir=model_cache_dir,
+         )
+
+         self.generation_config = self._parse_generation_config(tokenizer, model)

          if custom_generation_config:
-             logger.info('**Updating generation config ...')
-             self.generation_config.update(**custom_generation_config.to_dict())
-             logger.info(f'**Generation config init: {self.generation_config.to_dict()}')
+             logger.info('Updating generation config ...')
+             self.generation_config.update(**custom_generation_config)

-         super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
+         if custom_chat_template:
+             tokenizer.chat_template = custom_chat_template
+             logger.info(f'Using custom chat template: {custom_chat_template}')

-     def _parse_generation_config(self, tokenizer, model):
-         from modelscope.utils.hf_util import GenerationConfig
+         super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)

-         generation_config = getattr(model, 'generation_config', GenerationConfig())
+     def _parse_generation_config(self, tokenizer, model):
+         generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))

          try:
              remote_config = GenerationConfig.from_pretrained(
-                 self.model_id,
-                 revision=self.model_revision,
-                 trust_remote_code=True)
+                 self.model_id, revision=self.model_revision, trust_remote_code=True)
              generation_config.update(**remote_config.to_dict())
          except:
              logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')

-         # Parse templates for chat-completion
          if isinstance(self.model_id, str) and os.path.exists(self.model_id):
              logger.warning(f'Got local model dir: {self.model_id}')

-         generation_template = get_template(template_type=self.template_type, tokenizer=tokenizer)
-
          if tokenizer.eos_token_id is not None:
              generation_config.eos_token_id = tokenizer.eos_token_id
          if tokenizer.pad_token_id is not None:
@@ -451,24 +404,19 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
          if generation_config.max_new_tokens is None:
              generation_config.max_new_tokens = 2048

-         return generation_config, generation_template
+         return generation_config

      def _model_generate(self, query: str, infer_cfg: dict) -> str:
-         example = dict(query=query,
-                        history=[],
-                        system=None)
-
-         inputs, _ = self.generation_template.encode(example)
+         messages = [ChatMessage(role='user', content=query)]
+         formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
          input_ids = inputs['input_ids']
-         input_ids = torch.tensor(input_ids)[None].to(self.device)
-         attention_mask = torch.ones_like(input_ids).to(self.device)

          # Process infer_cfg
-         infer_cfg = infer_cfg or {}
          if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
              infer_cfg['do_sample'] = True

-         # TODO: stop settings
+         # stop settings
          stop = infer_cfg.get('stop', None)
          eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
              if stop else self.tokenizer.eos_token_id
@@ -478,25 +426,16 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
              infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token

          self.generation_config.update(**infer_cfg)
-
-         # stopping
-         stop_words = [self.generation_template.suffix[-1]]
-         decode_kwargs = {}
-         stopping_criteria = StoppingCriteriaList(
-             [StopWordsCriteria(self.tokenizer, stop_words, **decode_kwargs)])
+         fix_do_sample_warning(self.generation_config)

          # Run inference
-         output_ids = self.model.generate(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             generation_config=self.generation_config,
-             stopping_criteria=stopping_criteria, )
+         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)

-         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], True, **decode_kwargs)
+         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
          return response

      @torch.no_grad()
-     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = dict({})) -> dict:
+     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:

          # Process inputs
          if isinstance(inputs, str):
@@ -510,12 +449,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):

          response = self._model_generate(query, infer_cfg)

-         choices_list = [
-             {'index': 0,
-              'message': {'content': response,
-                          'role': 'assistant'}
-              }
-         ]
+         choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]

          res_d = {
              'choices': choices_list,
@@ -589,4 +523,3 @@ class CustomModelAdapter(BaseModelAdapter):
              raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')

          return self.custom_model.predict(prompts=in_prompts, **kwargs)
-
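The main change in model_adapter.py is that ChatGenerationModelAdapter no longer builds prompts through the removed evalscope/models/template.py machinery (get_template, StopWordsCriteria) but relies on the tokenizer's built-in chat template, with models and tokenizers loaded via the classes re-exported from modelscope. The following is a minimal hedged sketch of that new prompt path, not a verbatim excerpt from the package; the model id is illustrative and not taken from this diff.

# Sketch only: mirrors the new code path in ChatGenerationModelAdapter.
from modelscope import AutoModelForCausalLM, AutoTokenizer  # same imports as the new module header

model_id = 'Qwen/Qwen2-0.5B-Instruct'  # illustrative model id, not from this diff
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', trust_remote_code=True)

# The prompt is rendered from the tokenizer's chat template instead of a swift-style template.
messages = [{'role': 'user', 'content': 'What is the capital of France?'}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=64)
response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(response)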
evalscope/models/openai_model.py CHANGED
@@ -1,10 +1,9 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import openai
  import os
  import time

- import openai
-
  from evalscope.models import ChatBaseModel
  from evalscope.utils.logger import get_logger

@@ -43,22 +42,25 @@ class OpenAIModel(ChatBaseModel):

          logger.info(f'Using OpenAI model_id: {model_id}')

-         res = self._predict(model_id=model_id,
-                             sys_prompt=sys_prompt,
-                             user_prompt=user_prompt,
-                             temperature=temperature,
-                             max_tokens=max_tokens,
-                             mode=mode)
+         res = self._predict(
+             model_id=model_id,
+             sys_prompt=sys_prompt,
+             user_prompt=user_prompt,
+             temperature=temperature,
+             max_tokens=max_tokens,
+             mode=mode)

          return res

-     def _predict(self,
-                  model_id,
-                  sys_prompt,
-                  user_prompt,
-                  temperature,
-                  max_tokens,
-                  mode: str = 'chat.completion',) -> dict:
+     def _predict(
+         self,
+         model_id,
+         sys_prompt,
+         user_prompt,
+         temperature,
+         max_tokens,
+         mode: str = 'chat.completion',
+     ) -> dict:

          res = {}
          openai.api_key = self.api_key
@@ -82,9 +84,8 @@ class OpenAIModel(ChatBaseModel):
                  ans_text = resp['choices'][0]['message']['content']
                  model_id = resp['model']
              else:
-                 logger.warning(
-                     f'OpenAI GPT API call failed: got empty response '
-                     f'for input {sys_prompt} {user_prompt}')
+                 logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                f'for input {sys_prompt} {user_prompt}')
                  ans_text = ''
                  model_id = ''

@@ -98,6 +99,5 @@ class OpenAIModel(ChatBaseModel):
          except Exception as e:
              logger.warning(f'OpenAI API call failed: {e}')
              time.sleep(3)
-         logger.error(
-             f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+         logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
          return res
evalscope/perf/arguments.py CHANGED
@@ -1,18 +1,22 @@
  import argparse
+ import json
+ import os
  import sys
  from dataclasses import dataclass, field
  from typing import Any, Dict, List, Optional

- import json
+ from evalscope.constants import DEFAULT_WORK_DIR


  @dataclass
  class Arguments:
      # Model and API
-     model: str  # Model identifier
+     model: str  # Model name or path
+     model_id: Optional[str] = None  # Model identifier
      attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
      api: str = 'openai'  # API to be used (default: 'openai')
      tokenizer_path: Optional[str] = None  # Path to the tokenizer
+     port: str = '8877'  # Port number for the local API server

      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -32,6 +36,9 @@ class Arguments:
      wandb_api_key: Optional[str] = None  # WandB API key for logging
      name: Optional[str] = None  # Name for the run

+     # Output settings
+     outputs_dir: str = DEFAULT_WORK_DIR
+
      # Prompt settings
      max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
      min_prompt_length: int = 0  # Minimum length of the prompt
@@ -57,7 +64,6 @@ class Arguments:

      @staticmethod
      def from_args(args):
-
          return Arguments(
              model=args.model,
              attn_implementation=args.attn_implementation,
@@ -72,6 +78,7 @@ class Arguments:
              headers=args.headers,
              wandb_api_key=args.wandb_api_key,
              name=args.name,
+             outputs_dir=args.outputs_dir,
              debug=args.debug,
              tokenizer_path=args.tokenizer_path,
              api=args.api,
@@ -98,6 +105,7 @@ class Arguments:
          if self.api_key:
              # Assuming the API key is used as a Bearer token
              self.headers['Authorization'] = f'Bearer {self.api_key}'
+         self.model_id = os.path.basename(self.model)

      def __str__(self):
          return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
@@ -152,6 +160,9 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
      parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

+     # Output settings
+     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+
      # Dataset settings
      parser.add_argument('--dataset', type=str, default='openqa', help='Specify the dataset')
      parser.add_argument('--dataset-path', type=str, required=False, help='Path to the dataset file')
@@ -170,6 +181,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
      parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
      parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+
      # yapf: enable

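Two practical additions here are a separate model_id field and an outputs_dir setting exposed as --outputs-dir. Below is a hedged sketch, not library code, of what the added `self.model_id = os.path.basename(self.model)` line yields; the path is illustrative.

import os

model = '/data/models/Qwen2-7B-Instruct'  # illustrative --model value (a local path)
model_id = os.path.basename(model)        # -> 'Qwen2-7B-Instruct'
print(model_id)  # used e.g. for the default wandb run name, see the benchmark.py hunks below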
evalscope/perf/benchmark.py CHANGED
@@ -1,16 +1,15 @@
  import asyncio
  import copy
+ import json
+ import numpy as np
  import os
  import platform
  import sqlite3
  import threading
  import time
  from http import HTTPStatus
- from typing import List
-
- import json
- import numpy as np
  from tqdm import tqdm
+ from typing import List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -138,17 +137,17 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
      api_plugin_class = ApiRegistry(args.api)
      api_plugin = api_plugin_class(args.tokenizer_path)

-     result_db_path = get_result_db_path(args.name, args.model)
+     result_db_path = get_result_db_path(args)
      # Initialize wandb
      if args.wandb_api_key:
-         import wandb
          import datetime
+         import wandb
          os.environ['WANDB_SILENT'] = 'true'
-         os.environ['WANDB_DIR'] = './outputs'
+         os.environ['WANDB_DIR'] = args.outputs_dir

          wandb.login(key=args.wandb_api_key)
          current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-         name = args.name if args.name else f'{args.model}_{current_time}'
+         name = args.name if args.name else f'{args.model_id}_{current_time}'
          wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

      with sqlite3.connect(result_db_path) as con:
@@ -199,7 +198,6 @@ async def start_server(args: Arguments) -> bool:
          args.url = 'http://127.0.0.1:8877/v1/completions'
      else:
          args.url = 'http://127.0.0.1:8877/v1/chat/completions'
-     args.model = os.path.basename(args.model)

      if not await test_connection(args):
          raise TimeoutError('Test connection failed')
evalscope/perf/http_client.py CHANGED
@@ -1,12 +1,10 @@
+ import aiohttp
  import asyncio
- import logging
+ import json
  import time
  from http import HTTPStatus
  from typing import AsyncGenerator, Dict, List, Tuple

- import aiohttp
- import json
-
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.utils.local_server import ServerSentEvent
  from evalscope.utils.logger import get_logger
@@ -21,7 +19,6 @@ class AioHttpClient:
          args: Arguments,
      ):
          self.url = args.url
-         self.debug = args.debug
          self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
          self.read_timeout = args.read_timeout
          self.connect_timeout = args.connect_timeout
@@ -31,9 +28,7 @@ class AioHttpClient:
                  connect=self.connect_timeout,
                  sock_read=self.read_timeout),
              connector=aiohttp.TCPConnector(limit=1),
-             trace_configs=[self._create_trace_config()] if self.debug else [])
-         if self.debug:
-             get_logger(log_level=logging.DEBUG)
+             trace_configs=[self._create_trace_config()] if args.debug else [])

      def _create_trace_config(self):
          trace_config = aiohttp.TraceConfig()
evalscope/perf/main.py CHANGED
@@ -1,9 +1,12 @@
  import asyncio
+ import logging
+ import os
  import platform
  from argparse import Namespace

  from evalscope.perf.arguments import Arguments, parse_args
  from evalscope.perf.benchmark import benchmark
+ from evalscope.perf.utils.db_util import get_output_path
  from evalscope.perf.utils.handler import add_signal_handlers
  from evalscope.utils.logger import get_logger
  from evalscope.utils.utils import seed_everything
@@ -18,6 +21,13 @@ def run_perf_benchmark(args):
      args = Arguments.from_args(args)
      seed_everything(args.seed)

+     # Setup logger and output
+     args.outputs_dir = get_output_path(args)
+     get_logger(log_file=os.path.join(args.outputs_dir, 'benchmark.log'), force=True)
+
+     if args.debug:
+         get_logger(log_level=logging.DEBUG, force=True)
+
      logger.info('Starting benchmark...')
      logger.info(args)

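run_perf_benchmark now resolves an output directory up front and reconfigures the logger to write a benchmark.log there, switching to DEBUG level when --debug is set; the per-request debug handling that used to live in AioHttpClient (see the http_client.py hunks above) is gone. Below is a minimal sketch of the same pattern, simplified from the hunk above; the directory value is a stand-in for whatever get_output_path(args) returns, not the real computation.

import logging
import os

from evalscope.utils.logger import get_logger

outputs_dir = './outputs/perf_run'  # stand-in for get_output_path(args)
os.makedirs(outputs_dir, exist_ok=True)

# Re-initialize the shared logger so records also go to a file under outputs_dir.
get_logger(log_file=os.path.join(outputs_dir, 'benchmark.log'), force=True)

# Only when --debug is passed: bump the level to DEBUG.
debug = False
if debug:
    get_logger(log_level=logging.DEBUG, force=True)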
@@ -1,7 +1,6 @@
- from typing import Any, Dict, Iterator, List
-
  import json
  from transformers import AutoTokenizer
+ from typing import Any, Dict, Iterator, List

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.base import ApiPluginBase