evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/tools/combine_reports.py CHANGED
@@ -1,13 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- import os
- import json
  import glob
+ import json
+ import os
+ from collections import defaultdict
  from tabulate import tabulate
+
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
-
  """
  Combine and generate table for reports of LLMs.
  """
@@ -15,33 +16,29 @@ Combine and generate table for reports of LLMs.
 
  def get_report(report_file: str):
      data_d: dict = json.load(open(report_file, 'r'))
-     dataset_name = data_d['name']
-     score = data_d['score']  # float or dict
+     dataset_name = data_d['dataset_name']
+     model_name = data_d['model_name']
+     score = data_d['score']  # float or dict
+     metric = data_d['metric']
      score_d = {}
      if isinstance(score, dict):
-         # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
          score_d = score
      elif isinstance(score, float):
-         # score_d['acc'] = round(score, 4) * 100
-         score_d['acc'] = score
+         score_d[metric] = score
      else:
          raise ValueError(f'Unknown score type: {type(score)}')
-     # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
      score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
 
-     return {'dataset_name': dataset_name, 'score': score_str}
+     return model_name, {'dataset_name': dataset_name, 'score': score_str}
 
 
  def get_model_reports(model_report_dir: str):
      model_report_dir = os.path.normpath(model_report_dir)
-     model_report_dir = model_report_dir.rstrip('reports')
-     model_info = os.path.basename(os.path.normpath(model_report_dir))
-     model_name = '_'.join(model_info.split('_')[:-1][3:])
-     report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+     report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
 
-     model_reports_d = {model_name: []}
+     model_reports_d = defaultdict(list)
      for file_path in report_files:
-         report_d = get_report(file_path)
+         model_name, report_d = get_report(file_path)
          model_reports_d[model_name].append(report_d)
 
      return model_reports_d
@@ -55,8 +52,6 @@ def gen_table(reports_path_list: list):
      for report_path in reports_path_list:
          model_reports_d = get_model_reports(report_path)
          for model_name, report_list in model_reports_d.items():
-             # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
-             #               {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
              report_list = sorted(report_list, key=lambda x: x['dataset_name'])
              if not is_headers_set:
                  headers.extend([x['dataset_name'] for x in report_list])
@@ -71,37 +66,34 @@ def gen_table(reports_path_list: list):
      report_table = tabulate(table_values, headers=headers, tablefmt='grid')
      return report_table
 
+
  class ReportsRecorder:
      COMMON_DATASET_PATH = []
      CUSTOM_DATASET_PATH = []
 
-     def __init__(self, oss_url: str = "", endpoint: str = ""):
+     def __init__(self, oss_url: str = '', endpoint: str = ''):
          if oss_url and endpoint:
              import oss2
              from oss2.credentials import EnvironmentVariableCredentialsProvider
 
              auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-             oss_url = oss_url.replace("oss://", "").split('/')
+             oss_url = oss_url.replace('oss://', '').split('/')
              bucket_name = oss_url[0]
 
-             self.object_path = "/".join(oss_url[1:])
+             self.object_path = '/'.join(oss_url[1:])
              self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
          else:
-             self.object_path = ""
+             self.object_path = ''
              self.bucket = None
 
-
      def append_path(self, report_path: str, dataset_name: str):
-         if dataset_name == "general_qa":
+         if dataset_name == 'general_qa':
              self.CUSTOM_DATASET_PATH.append(report_path)
          else:
              self.COMMON_DATASET_PATH.append(report_path)
-
+
      def dump_reports(self, output_dir: str):
-         result = {
-             "CommonDataset": [],
-             "CustomDataset": []
-         }
+         result = {'CommonDataset': [], 'CustomDataset': []}
          for line in self.COMMON_DATASET_PATH:
              with open(line, 'r') as f:
                  report = json.load(f)
@@ -109,20 +101,21 @@ class ReportsRecorder:
          for line in self.CUSTOM_DATASET_PATH:
              with open(line, 'r') as f:
                  report = json.load(f)
-                 report.update({"name": os.path.basename(line)})
+                 report.update({'name': os.path.basename(line)})
                  result['CustomDataset'].append(report)
-
+
          os.makedirs(output_dir, exist_ok=True)
-         output_file_name = "metric.json"
+         output_file_name = 'metric.json'
          output_path = os.path.join(output_dir, output_file_name)
          with open(output_path, 'w+') as f:
              f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
+
          if self.bucket:
              remote_path = os.path.join(self.object_path, output_file_name)
-             logger.info(f"** Upload report to oss: {remote_path}")
+             logger.info(f'** Upload report to oss: {remote_path}')
              self.bucket.put_object_from_file(remote_path, output_path)
 
+
  if __name__ == '__main__':
      report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
      report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
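Note on this hunk: get_report() now pulls the model name and metric out of each report JSON instead of deriving them from the directory name. A minimal sketch of the per-dataset report it expects and of the row it returns (key names taken from the hunk above; the concrete values are illustrative only):

    report = {
        'dataset_name': 'gsm8k',
        'model_name': 'qwen-7b-chat',
        'score': 0.2851,              # a float, or a dict mapping metric name -> value
        'metric': 'AverageAccuracy',
    }

    score = report['score']
    score_d = score if isinstance(score, dict) else {report['metric']: score}
    score_str = '\n'.join('(' + report['dataset_name'] + '/' + k + ') ' + str(v) for k, v in score_d.items())
    # get_report() returns (model_name, row) so that get_model_reports() can group rows
    # per model with defaultdict(list) while globbing '**/*.json' under the report dir.
    print(report['model_name'], {'dataset_name': report['dataset_name'], 'score': score_str})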
evalscope/tools/rewrite_eval_results.py CHANGED
@@ -4,12 +4,10 @@ import time
 
  from evalscope.models.custom import CustomModel
  from evalscope.run import run_task
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
- from evalscope.utils import yaml_to_dict
+ from evalscope.utils.io_utils import yaml_to_dict
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
-
  """
  This script is used to rewrite the evaluation results without re-running the model predictions.
  """
@@ -26,19 +24,20 @@ class DummyCustomModel(CustomModel):
          response = 'The answer is C. NOTE: ONLY FOR TEST'
 
          res_d: dict = {
-             'choices': [
-                 {
-                     'index': 0,
-                     'message': {
-                         # 'content': f'The answer is B. Raw prompt: {prompt}',
-                         'content': response,
-                         'role': 'assistant'
-                     }
+             'choices': [{
+                 'index': 0,
+                 'message': {
+                     # 'content': f'The answer is B. Raw prompt: {prompt}',
+                     'content': response,
+                     'role': 'assistant'
                  }
-             ],
-             'created': time.time(),
-             'model': self.config.get('model_id'),  # should be model_id
-             'object': 'chat.completion',
+             }],
+             'created':
+             time.time(),
+             'model':
+             self.config.get('model_id'),  # should be model_id
+             'object':
+             'chat.completion',
              'usage': {
                  'completion_tokens': 0,
                  'prompt_tokens': 0,
@@ -49,36 +48,6 @@ class DummyCustomModel(CustomModel):
          return [res_d for _ in prompts]
 
 
- def get_task_cfg(cfg_file: str, model_instance: CustomModel):
-     if cfg_file:
-         cfg_file: str = os.path.abspath(cfg_file)
-         logger.info(f'Loading task config from {cfg_file}')
-         task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
-         task_cfg_d.update({'model': model_instance})
-         logger.info(f'**Task config: {task_cfg_d}')
-     else:
-         # Default config example
-         task_cfg_d = {
-             'model_args': {},
-             'generation_config': {},
-             'dataset_args': {},
-             'dry_run': False,
-             'model': model_instance,  # NOTE: model_id or # model_dir or model_instance(CustomModel)
-             'eval_type': 'custom',  # NOTE: `checkpoint` or `custom` or `service`
-             'datasets': ['arc'],
-             'work_dir': DEFAULT_ROOT_CACHE_DIR,
-             'outputs': './outputs/eval_swift_dummy',
-             'mem_cache': False,
-             'dataset_hub': 'ModelScope',
-             'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
-             'stage': 'all',
-             'limit': 10,
-             'debug': False
-         }
-
-     return task_cfg_d
-
-
  if __name__ == '__main__':
      # step1: If the outputs directory has been moved, update the path settings in configs/task_output_config.yaml under outputs/eval_xxx
      # step2: Run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
@@ -91,5 +60,4 @@ if __name__ == '__main__':
      task_cfg_d.update({'model': swift_model})
 
      eval_results: dict = run_task(task_cfg=task_cfg_d)
-     print(f'** Evaluation results finished !\n')
-
+     print('** Evaluation results finished !\n')
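For reference, the DummyCustomModel above returns one OpenAI-style 'chat.completion' dict per prompt. A standalone sketch of that payload (structure copied from the hunk; the 'total_tokens' field is an assumption, since the usage block is truncated above):

    import time

    def dummy_completion(response: str, model_id: str) -> dict:
        return {
            'choices': [{
                'index': 0,
                'message': {
                    'content': response,
                    'role': 'assistant'
                }
            }],
            'created': time.time(),
            'model': model_id,            # should be the model_id
            'object': 'chat.completion',
            'usage': {
                'completion_tokens': 0,   # the dummy model reports zero token usage
                'prompt_tokens': 0,
                'total_tokens': 0         # assumed; not visible in the hunk above
            }
        }

    print(dummy_completion('The answer is C. NOTE: ONLY FOR TEST', 'dummy-model'))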
evalscope/utils/__init__.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
+ from evalscope.constants import *
  from evalscope.utils.utils import *
- from evalscope.utils.task_utils import *
evalscope/utils/arena_utils.py CHANGED
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) lmsys.org.
 
- import random
- from collections import OrderedDict, defaultdict
- from typing import List, Sequence, Union
-
  import numpy as np
  import pandas as pd
  import pyarrow as pa
+ import random
+ from collections import OrderedDict, defaultdict
+ from typing import List, Sequence, Union
 
  from evalscope.utils.logger import get_logger
 
@@ -25,9 +24,7 @@ def compute_elo(battles,
                  init_rating=1000):
      rating = defaultdict(lambda: init_rating)
 
-     for rd, model_a, model_b, win in battles[[
-             col_model_a, col_model_b, col_win
-     ]].itertuples():
+     for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
          ra = rating[model_a]
          rb = rating[model_b]
          ea = 1 / (1 + base**((rb - ra) / scale))
@@ -46,9 +43,7 @@
      return rating
 
 
- def merge_ques_ans(answer_list_all,
-                    merge_key: str = 'question_id',
-                    merge_mode: str = 'inner') -> pd.DataFrame:
+ def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
      """
      Merge question and answer list to unifiled data.
 
@@ -67,18 +62,11 @@
      """
      ans_df = pd.DataFrame()
      for ans_list in answer_list_all:
-         ans_list = [{
-             'question_id': item['question_id'],
-             item['model_id']: item
-         } for item in ans_list]
+         ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
          if ans_df.empty:
              ans_df = pa.Table.from_pylist(ans_list).to_pandas()
          else:
-             ans_df = pd.merge(
-                 ans_df,
-                 pa.Table.from_pylist(ans_list).to_pandas(),
-                 on=merge_key,
-                 how=merge_mode)
+             ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)
 
      return ans_df
 
@@ -112,21 +100,17 @@ def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
 
      if baseline_idx != -1:
          n_column = columns[baseline_idx]
-         res_list = [(column, n_column) for column in columns
-                     if column != n_column]
+         res_list = [(column, n_column) for column in columns if column != n_column]
      else:
          mat = np.ones((cols_num, cols_num))
          mat_lower_tril = np.tril(mat, k=-1)
          x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id])
-                     for x_id, y_id in zip(x_ids, y_ids)]
+         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
 
      return res_list
 
 
- def get_battle_pairs_origin(columns: List[str],
-                             compare_base: bool = False,
-                             swap: bool = False):  # TODO: to refactor
+ def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False):  # TODO: to refactor
      """
      Get battle pair names from columns.
 
@@ -152,8 +136,7 @@ def get_battle_pairs_origin(columns: List[str],
          mat = np.ones((cols_num, cols_num))
          mat_lower_tril = np.tril(mat, k=-1)
          x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id])
-                     for x_id, y_id in zip(x_ids, y_ids)]
+         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
      else:
          for column in columns[1:]:
              res_list.append((columns[0], column))
@@ -163,8 +146,7 @@
      return res_list
 
 
- def shuffle_pairwise_preferences(
-         df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+ def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
      """Shuffle the outputs of a pairwise preference dataframe.
 
      Examples
@@ -182,8 +164,7 @@
      df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
 
      if 'preference' in df.columns:
-         df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
-                                     df['preference'])
+         df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])
 
      return df
 
@@ -202,20 +183,14 @@ class BattlePairSelection:
          # Make sure model_elo_map to be ordered when compare_base is true.
          self.model_elo_map = model_elo_map
 
-     def top_k(self,
-               k: int = DEFAULT_K,
-               compare_base: bool = False,
-               swap: bool = False) -> list:
+     def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
          if k <= 0:
              k = self.DEFAULT_K
          sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
          sorted_res = list(dict(sorted_res).keys())
          return get_battle_pairs_origin(sorted_res, compare_base, swap)
 
-     def random_k(self,
-                  k: int = DEFAULT_K,
-                  compare_base: bool = False,
-                  swap: bool = False) -> list:
+     def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
          if k <= 0:
              k = self.DEFAULT_K
          if k > len(self.model_elo_map):
@@ -226,21 +201,16 @@ class BattlePairSelection:
          res = list(res.keys())
          return get_battle_pairs_origin(res, compare_base, swap)
 
-     def volatility_index(self,
-                          frac: float = 0.2,
-                          compare_base: bool = False,
-                          swap: bool = False) -> list:
+     def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
          res_list = []
-         candidate_list = get_battle_pairs_origin(
-             list(self.model_elo_map.keys()), compare_base, swap)
+         candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
          for t in candidate_list:
              model_a = t[0]
              model_b = t[1]
              base_elo_a = self.model_elo_map.get(model_a)
              base_elo_b = self.model_elo_map.get(model_b)
 
-             vol_frac = abs(base_elo_b - base_elo_a) / max(
-                 base_elo_a, base_elo_b)
+             vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
              if vol_frac <= frac:
                  res_list.append(t)
 
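compute_elo() above applies one Elo update per battle row; only the expected-score line is visible in the hunk, so the K factor and the win/tie scoring in this self-contained sketch are assumptions following the usual lmsys convention (the real function iterates a pandas DataFrame of battles rather than plain tuples):

    from collections import defaultdict

    def elo_ratings(battles, K=4, base=10, scale=400, init_rating=1000):
        """battles: iterable of (model_a, model_b, winner), winner in {'model_a', 'model_b', 'tie'}."""
        rating = defaultdict(lambda: init_rating)
        for model_a, model_b, win in battles:
            ra, rb = rating[model_a], rating[model_b]
            ea = 1 / (1 + base**((rb - ra) / scale))   # expected score of model_a, as in the hunk
            eb = 1 - ea                                # expected score of model_b
            sa = 1.0 if win == 'model_a' else 0.0 if win == 'model_b' else 0.5
            rating[model_a] = ra + K * (sa - ea)       # assumed K factor and update rule
            rating[model_b] = rb + K * ((1 - sa) - eb)
        return dict(rating)

    print(elo_ratings([('qwen-7b-chat', 'chatglm2-6b', 'model_a'), ('qwen-7b-chat', 'chatglm2-6b', 'tie')]))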
evalscope/{perf/utils → utils}/chat_service.py RENAMED
@@ -1,14 +1,13 @@
  import os
  import time
+ import torch
  from contextlib import contextmanager
  from functools import partial
- from threading import Thread
- from typing import List, Literal, Optional, Union
-
- import torch
  from modelscope import AutoModelForCausalLM, AutoTokenizer
  from pydantic import BaseModel, Field
+ from threading import Thread
  from transformers import TextIteratorStreamer
+ from typing import List, Literal, Optional, Union
 
 
  class Usage(BaseModel):
@@ -44,7 +43,7 @@ class DeltaMessage(BaseModel):
 
  class ChatCompletionRequest(BaseModel):
      model: str
-     messages: List[ChatMessage] | str
+     messages: Union[List[ChatMessage], str]
      temperature: Optional[float] = None
      top_p: Optional[float] = None
      max_tokens: Optional[int] = 2048
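The only functional change to chat_service.py swaps the PEP 604 annotation List[ChatMessage] | str for typing.Union, presumably so the request model still imports on Python versions below 3.10, where the bare `|` union syntax is unavailable. A minimal sketch of the pattern (the ChatMessage fields here are assumed; the real model defines more fields):

    from typing import List, Optional, Union

    from pydantic import BaseModel


    class ChatMessage(BaseModel):
        role: str       # assumed fields, following the usual OpenAI-style schema
        content: str


    class ChatCompletionRequest(BaseModel):
        model: str
        messages: Union[List[ChatMessage], str]   # accepts a chat history or a bare prompt
        temperature: Optional[float] = None
        top_p: Optional[float] = None
        max_tokens: Optional[int] = 2048


    # Both forms validate:
    ChatCompletionRequest(model='qwen-7b-chat', messages='Hello')
    ChatCompletionRequest(model='qwen-7b-chat', messages=[ChatMessage(role='user', content='Hello')])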
evalscope/utils/completion_parsers.py CHANGED
@@ -4,7 +4,6 @@
  import ast
  import re
 
-
  # from . import utils as ann_utils
  from evalscope.constants import ArenaWinner
  from evalscope.utils.logger import get_logger
@@ -51,9 +50,7 @@ def lmsys_parser(completion, output_format):
              else:
                  raise Exception('Invalid score pair.')
          except Exception as e:
-             logger.error(
-                 f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
-             )
+             logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
              return ArenaWinner.UNKNOWN, [-1, -1]
      elif output_format == '[[A]]':
          if '[[A]]' in completion:
@@ -63,8 +60,7 @@
          elif '[[C]]' in completion:
              winner = ArenaWinner.TIE
          else:
-             logger.error(
-                 f'\nContent: {completion}\nYou must manually fix the score.')
+             logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
              winner = ArenaWinner.UNKNOWN
          return winner
 
@@ -76,8 +72,7 @@ def ranking_parser(completion, **kwargs):
      else:
          ordered_completions = completion
 
-     rank = [c for c in ordered_completions
-             if c['model'] == 'model_a'][0]['rank']
+     rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
      assert rank in [1, 2]
 
      return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
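For context, the '[[A]]' output format handled by lmsys_parser() maps a bracketed verdict in the judge completion to a winner. Only the [[A]] check and the [[C]] -> tie branch are visible in the hunks above, so the full mapping in this standalone sketch is an assumption consistent with those branches (the real code returns ArenaWinner members rather than strings):

    import logging

    logger = logging.getLogger(__name__)

    def parse_pairwise_verdict(completion: str) -> str:
        if '[[A]]' in completion:
            return 'model_a'    # assumed branch
        if '[[B]]' in completion:
            return 'model_b'    # assumed branch
        if '[[C]]' in completion:
            return 'tie'
        logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
        return 'unknown'

    print(parse_pairwise_verdict('Both answers are helpful, but [[A]] is more accurate.'))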
evalscope/utils/io_utils.py ADDED
@@ -0,0 +1,162 @@
+ import json
+ import jsonlines as jsonl
+ import os
+ import yaml
+
+ from evalscope.constants import DumpMode
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class OutputsStructure:
+     LOGS_DIR = 'logs'
+     PREDICTIONS_DIR = 'predictions'
+     REVIEWS_DIR = 'reviews'
+     REPORTS_DIR = 'reports'
+     CONFIGS_DIR = 'configs'
+
+     def __init__(self, outputs_dir: str, is_make=True):
+         self.outputs_dir = outputs_dir
+         self.is_make = is_make
+         self._dirs = {
+             'logs_dir': None,
+             'predictions_dir': None,
+             'reviews_dir': None,
+             'reports_dir': None,
+             'configs_dir': None
+         }
+
+     def _get_dir(self, attr_name, dir_name):
+         if self._dirs[attr_name] is None:
+             dir_path = os.path.join(self.outputs_dir, dir_name)
+             if self.is_make:
+                 os.makedirs(dir_path, exist_ok=True)
+             self._dirs[attr_name] = dir_path
+         return self._dirs[attr_name]
+
+     @property
+     def logs_dir(self):
+         return self._get_dir('logs_dir', OutputsStructure.LOGS_DIR)
+
+     @property
+     def predictions_dir(self):
+         return self._get_dir('predictions_dir', OutputsStructure.PREDICTIONS_DIR)
+
+     @property
+     def reviews_dir(self):
+         return self._get_dir('reviews_dir', OutputsStructure.REVIEWS_DIR)
+
+     @property
+     def reports_dir(self):
+         return self._get_dir('reports_dir', OutputsStructure.REPORTS_DIR)
+
+     @property
+     def configs_dir(self):
+         return self._get_dir('configs_dir', OutputsStructure.CONFIGS_DIR)
+
+
+ def jsonl_to_list(jsonl_file):
+     """
+     Read jsonl file to list.
+
+     Args:
+         jsonl_file: jsonl file path.
+
+     Returns:
+         list: list of lines. Each line is a dict.
+     """
+     res_list = []
+     with jsonl.open(jsonl_file, mode='r') as reader:
+         for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+             res_list.append(line)
+     return res_list
+
+
+ def jsonl_to_reader(jsonl_file):
+     """
+     Read jsonl file to reader object.
+
+     Args:
+         jsonl_file: jsonl file path.
+
+     Returns:
+         reader: jsonl reader object.
+     """
+     with jsonl.open(jsonl_file, mode='r') as reader:
+         return reader
+
+
+ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
+     """
+     Dump data to jsonl file.
+
+     Args:
+         data_list: data list to be dumped.  [{'a': 'aaa'}, ...]
+         jsonl_file: jsonl file path.
+         dump_mode: dump mode. It can be 'overwrite' or 'append'.
+     """
+     if not jsonl_file:
+         raise ValueError('output file must be provided.')
+
+     jsonl_file = os.path.expanduser(jsonl_file)
+
+     if not isinstance(data_list, list):
+         data_list = [data_list]
+
+     if dump_mode == DumpMode.OVERWRITE:
+         dump_mode = 'w'
+     elif dump_mode == DumpMode.APPEND:
+         dump_mode = 'a'
+     with jsonl.open(jsonl_file, mode=dump_mode) as writer:
+         writer.write_all(data_list)
+
+
+ def jsonl_to_csv():
+     pass
+
+
+ def yaml_to_dict(yaml_file) -> dict:
+     """
+     Read yaml file to dict.
+     """
+     with open(yaml_file, 'r') as f:
+         try:
+             stream = yaml.safe_load(f)
+         except yaml.YAMLError as e:
+             logger.error(f'{e}')
+             raise e
+
+     return stream
+
+
+ def dict_to_yaml(d: dict, yaml_file: str):
+     """
+     Dump dict to yaml file.
+     """
+     with open(yaml_file, 'w') as f:
+         yaml.dump(d, f, default_flow_style=False)
+
+
+ def json_to_dict(json_file) -> dict:
+     """
+     Read json file to dict.
+     """
+     with open(json_file, 'r') as f:
+         try:
+             stream = json.load(f)
+         except json.JSONDecodeError as e:
+             logger.error(f'{e}')
+             raise e
+
+     return stream
+
+
+ def are_paths_same(path1, path2):
+     """
+     Check if two paths are the same.
+     """
+     real_path1 = os.path.realpath(os.path.abspath(os.path.expanduser(path1)))
+     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
+
+     return real_path1 == real_path2
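A short usage sketch of the new io_utils helpers added in this file (paths are hypothetical; assumes evalscope 0.8.1 is installed so the module above is importable):

    import os
    import tempfile

    from evalscope.utils.io_utils import OutputsStructure, dict_to_yaml, dump_jsonl_data, jsonl_to_list, yaml_to_dict

    work_dir = tempfile.mkdtemp()

    # Output directories are created lazily, only when the matching property is first accessed.
    outputs = OutputsStructure(outputs_dir=work_dir)
    print(outputs.reports_dir)    # <work_dir>/reports

    # Round-trip a list of dicts through a jsonl file (default dump mode is DumpMode.OVERWRITE).
    jsonl_file = os.path.join(work_dir, 'predictions.jsonl')
    dump_jsonl_data([{'id': 1, 'answer': 'A'}, {'id': 2, 'answer': 'B'}], jsonl_file)
    print(jsonl_to_list(jsonl_file))

    # YAML helpers used for task configs.
    yaml_file = os.path.join(work_dir, 'task.yaml')
    dict_to_yaml({'datasets': ['arc'], 'limit': 10}, yaml_file)
    print(yaml_to_dict(yaml_file))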
evalscope/utils/logger.py CHANGED
@@ -1,5 +1,6 @@
  import importlib.util as iutil
  import logging
+ import os
  from typing import Optional
 
  init_loggers = {}
@@ -9,11 +10,12 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
  detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
- logging.basicConfig(format=simple_format, level=logging.INFO)
+ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 
 
- def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'):
+ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
      """Get logging logger
 
      Args:
@@ -29,12 +31,12 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
      logger.propagate = False
 
      if logger_name in init_loggers:
-         if logger.level != log_level:
+         if force:
              logger.setLevel(log_level)
-         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
-         for handler in logger.handlers:
-             handler.setLevel(log_level)
-             handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+             for handler in logger.handlers:
+                 handler.setLevel(log_level)
+                 handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
          return logger
 
      # handle duplicate logs to the console
@@ -73,6 +75,14 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
      return logger
 
 
+ def configure_logging(debug: bool, log_file: Optional[str] = None):
+     """Configure logging level based on the debug flag."""
+     if log_file:
+         get_logger(log_file=log_file, force=True)
+     if debug:
+         get_logger(log_level=logging.DEBUG, force=True)
+
+
  def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
      for handler in logger.handlers:
          if isinstance(handler, logging.FileHandler):
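A hedged sketch of how the reworked logger is driven: the default level now follows the LOG_LEVEL environment variable (read when the module is imported), an already initialised logger is only re-levelled when force=True, and the new configure_logging() helper wraps both the debug flag and the optional log file (values below are illustrative):

    import logging
    import os

    os.environ.setdefault('LOG_LEVEL', 'DEBUG')   # must be set before evalscope imports the logger module

    from evalscope.utils.logger import configure_logging, get_logger

    logger = get_logger()                         # picks up DEFAULT_LEVEL derived from LOG_LEVEL
    logger.info('handlers configured')

    # Re-configuring an already initialised logger requires force=True...
    get_logger(log_level=logging.DEBUG, force=True)

    # ...or let configure_logging() decide from a debug flag / optional log file.
    configure_logging(debug=True, log_file=None)
    logger.debug('now visible at DEBUG level')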