evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/tools/combine_reports.py CHANGED
@@ -1,13 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import os
- import json
  import glob
+ import json
+ import os
+ from collections import defaultdict
  from tabulate import tabulate
+
  from evalscope.utils.logger import get_logger

  logger = get_logger()
-
  """
  Combine and generate table for reports of LLMs.
  """
@@ -15,8 +16,9 @@ Combine and generate table for reports of LLMs.

  def get_report(report_file: str):
  data_d: dict = json.load(open(report_file, 'r'))
- dataset_name = data_d['name']
- score = data_d['score'] # float or dict
+ dataset_name = data_d['dataset_name']
+ model_name = data_d['model_name']
+ score = data_d['score'] # float or dict
  score_d = {}
  if isinstance(score, dict):
  # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
@@ -29,19 +31,16 @@ def get_report(report_file: str):
  # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
  score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])

- return {'dataset_name': dataset_name, 'score': score_str}
+ return model_name, {'dataset_name': dataset_name, 'score': score_str}


  def get_model_reports(model_report_dir: str):
  model_report_dir = os.path.normpath(model_report_dir)
- model_report_dir = model_report_dir.rstrip('reports')
- model_info = os.path.basename(os.path.normpath(model_report_dir))
- model_name = '_'.join(model_info.split('_')[:-1][3:])
- report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+ report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))

- model_reports_d = {model_name: []}
+ model_reports_d = defaultdict(list)
  for file_path in report_files:
- report_d = get_report(file_path)
+ model_name, report_d = get_report(file_path)
  model_reports_d[model_name].append(report_d)

  return model_reports_d
@@ -55,8 +54,6 @@ def gen_table(reports_path_list: list):
  for report_path in reports_path_list:
  model_reports_d = get_model_reports(report_path)
  for model_name, report_list in model_reports_d.items():
- # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
- # {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
  report_list = sorted(report_list, key=lambda x: x['dataset_name'])
  if not is_headers_set:
  headers.extend([x['dataset_name'] for x in report_list])
@@ -71,37 +68,34 @@ def gen_table(reports_path_list: list):
  report_table = tabulate(table_values, headers=headers, tablefmt='grid')
  return report_table

+
  class ReportsRecorder:
  COMMON_DATASET_PATH = []
  CUSTOM_DATASET_PATH = []

- def __init__(self, oss_url: str = "", endpoint: str = ""):
+ def __init__(self, oss_url: str = '', endpoint: str = ''):
  if oss_url and endpoint:
  import oss2
  from oss2.credentials import EnvironmentVariableCredentialsProvider

  auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
- oss_url = oss_url.replace("oss://", "").split('/')
+ oss_url = oss_url.replace('oss://', '').split('/')
  bucket_name = oss_url[0]

- self.object_path = "/".join(oss_url[1:])
+ self.object_path = '/'.join(oss_url[1:])
  self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
  else:
- self.object_path = ""
+ self.object_path = ''
  self.bucket = None

-
  def append_path(self, report_path: str, dataset_name: str):
- if dataset_name == "general_qa":
+ if dataset_name == 'general_qa':
  self.CUSTOM_DATASET_PATH.append(report_path)
  else:
  self.COMMON_DATASET_PATH.append(report_path)
-
+
  def dump_reports(self, output_dir: str):
- result = {
- "CommonDataset": [],
- "CustomDataset": []
- }
+ result = {'CommonDataset': [], 'CustomDataset': []}
  for line in self.COMMON_DATASET_PATH:
  with open(line, 'r') as f:
  report = json.load(f)
@@ -109,20 +103,21 @@ class ReportsRecorder:
  for line in self.CUSTOM_DATASET_PATH:
  with open(line, 'r') as f:
  report = json.load(f)
- report.update({"name": os.path.basename(line)})
+ report.update({'name': os.path.basename(line)})
  result['CustomDataset'].append(report)
-
+
  os.makedirs(output_dir, exist_ok=True)
- output_file_name = "metric.json"
+ output_file_name = 'metric.json'
  output_path = os.path.join(output_dir, output_file_name)
  with open(output_path, 'w+') as f:
  f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
+
  if self.bucket:
  remote_path = os.path.join(self.object_path, output_file_name)
- logger.info(f"** Upload report to oss: {remote_path}")
+ logger.info(f'** Upload report to oss: {remote_path}')
  self.bucket.put_object_from_file(remote_path, output_path)

+
  if __name__ == '__main__':
  report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
  report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
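
For orientation, a minimal sketch (with hypothetical report data) of the aggregation shape after this change: get_report now returns the model name alongside the per-dataset row, and get_model_reports groups rows into a defaultdict keyed by model instead of parsing the model name out of the directory path.

from collections import defaultdict

# Hypothetical report rows, shaped like the 0.8.0 report JSON ('model_name' / 'dataset_name' / 'score').
reports = [
    {'model_name': 'qwen-7b-chat', 'dataset_name': 'ARC', 'score': {'acc': 0.81}},
    {'model_name': 'qwen-7b-chat', 'dataset_name': 'GSM8K', 'score': {'acc': 0.52}},
]

model_reports_d = defaultdict(list)
for data_d in reports:
    model_name = data_d['model_name']  # get_report() now returns this alongside the row
    model_reports_d[model_name].append({'dataset_name': data_d['dataset_name'], 'score': data_d['score']})

print(dict(model_reports_d))  # one entry per model, each holding its per-dataset scores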
evalscope/tools/rewrite_eval_results.py CHANGED
@@ -4,12 +4,10 @@ import time

  from evalscope.models.custom import CustomModel
  from evalscope.run import run_task
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
  from evalscope.utils import yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()
-
  """
  This script is used to rewrite the evaluation results without re-running the model predictions.
  """
@@ -26,19 +24,20 @@ class DummyCustomModel(CustomModel):
  response = 'The answer is C. NOTE: ONLY FOR TEST'

  res_d: dict = {
- 'choices': [
- {
- 'index': 0,
- 'message': {
- # 'content': f'The answer is B. Raw prompt: {prompt}',
- 'content': response,
- 'role': 'assistant'
- }
+ 'choices': [{
+ 'index': 0,
+ 'message': {
+ # 'content': f'The answer is B. Raw prompt: {prompt}',
+ 'content': response,
+ 'role': 'assistant'
  }
- ],
- 'created': time.time(),
- 'model': self.config.get('model_id'), # should be model_id
- 'object': 'chat.completion',
+ }],
+ 'created':
+ time.time(),
+ 'model':
+ self.config.get('model_id'), # should be model_id
+ 'object':
+ 'chat.completion',
  'usage': {
  'completion_tokens': 0,
  'prompt_tokens': 0,
@@ -49,36 +48,6 @@ class DummyCustomModel(CustomModel):
  return [res_d for _ in prompts]


- def get_task_cfg(cfg_file: str, model_instance: CustomModel):
- if cfg_file:
- cfg_file: str = os.path.abspath(cfg_file)
- logger.info(f'Loading task config from {cfg_file}')
- task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
- task_cfg_d.update({'model': model_instance})
- logger.info(f'**Task config: {task_cfg_d}')
- else:
- # Default config example
- task_cfg_d = {
- 'model_args': {},
- 'generation_config': {},
- 'dataset_args': {},
- 'dry_run': False,
- 'model': model_instance, # NOTE: model_id or # model_dir or model_instance(CustomModel)
- 'eval_type': 'custom', # NOTE: `checkpoint` or `custom` or `service`
- 'datasets': ['arc'],
- 'work_dir': DEFAULT_ROOT_CACHE_DIR,
- 'outputs': './outputs/eval_swift_dummy',
- 'mem_cache': False,
- 'dataset_hub': 'ModelScope',
- 'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
- 'stage': 'all',
- 'limit': 10,
- 'debug': False
- }
-
- return task_cfg_d
-
-
  if __name__ == '__main__':
  # step1: if the outputs directory has been moved, update the paths configured in configs/task_output_config.yaml under outputs/eval_xxx
  # step2: run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
@@ -91,5 +60,4 @@ if __name__ == '__main__':
  task_cfg_d.update({'model': swift_model})

  eval_results: dict = run_task(task_cfg=task_cfg_d)
- print(f'** Evaluation results finished !\n')
-
+ print('** Evaluation results finished !\n')
evalscope/utils/__init__.py CHANGED
@@ -1,4 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  from evalscope.utils.utils import *
- from evalscope.utils.task_utils import *
evalscope/utils/arena_utils.py CHANGED
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) lmsys.org.

- import random
- from collections import OrderedDict, defaultdict
- from typing import List, Sequence, Union
-
  import numpy as np
  import pandas as pd
  import pyarrow as pa
+ import random
+ from collections import OrderedDict, defaultdict
+ from typing import List, Sequence, Union

  from evalscope.utils.logger import get_logger

@@ -25,9 +24,7 @@ def compute_elo(battles,
  init_rating=1000):
  rating = defaultdict(lambda: init_rating)

- for rd, model_a, model_b, win in battles[[
- col_model_a, col_model_b, col_win
- ]].itertuples():
+ for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
  ra = rating[model_a]
  rb = rating[model_b]
  ea = 1 / (1 + base**((rb - ra) / scale))
@@ -46,9 +43,7 @@ def compute_elo(battles,
  return rating


- def merge_ques_ans(answer_list_all,
- merge_key: str = 'question_id',
- merge_mode: str = 'inner') -> pd.DataFrame:
+ def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
  """
  Merge question and answer list to unifiled data.

@@ -67,18 +62,11 @@
  """
  ans_df = pd.DataFrame()
  for ans_list in answer_list_all:
- ans_list = [{
- 'question_id': item['question_id'],
- item['model_id']: item
- } for item in ans_list]
+ ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
  if ans_df.empty:
  ans_df = pa.Table.from_pylist(ans_list).to_pandas()
  else:
- ans_df = pd.merge(
- ans_df,
- pa.Table.from_pylist(ans_list).to_pandas(),
- on=merge_key,
- how=merge_mode)
+ ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)

  return ans_df

@@ -112,21 +100,17 @@ def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:

  if baseline_idx != -1:
  n_column = columns[baseline_idx]
- res_list = [(column, n_column) for column in columns
- if column != n_column]
+ res_list = [(column, n_column) for column in columns if column != n_column]
  else:
  mat = np.ones((cols_num, cols_num))
  mat_lower_tril = np.tril(mat, k=-1)
  x_ids, y_ids = np.where(mat_lower_tril == 1)
- res_list = [(columns[x_id], columns[y_id])
- for x_id, y_id in zip(x_ids, y_ids)]
+ res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]

  return res_list


- def get_battle_pairs_origin(columns: List[str],
- compare_base: bool = False,
- swap: bool = False): # TODO: to refactor
+ def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False): # TODO: to refactor
  """
  Get battle pair names from columns.

@@ -152,8 +136,7 @@
  mat = np.ones((cols_num, cols_num))
  mat_lower_tril = np.tril(mat, k=-1)
  x_ids, y_ids = np.where(mat_lower_tril == 1)
- res_list = [(columns[x_id], columns[y_id])
- for x_id, y_id in zip(x_ids, y_ids)]
+ res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
  else:
  for column in columns[1:]:
  res_list.append((columns[0], column))
@@ -163,8 +146,7 @@
  return res_list


- def shuffle_pairwise_preferences(
- df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+ def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
  """Shuffle the outputs of a pairwise preference dataframe.

  Examples
@@ -182,8 +164,7 @@
  df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)

  if 'preference' in df.columns:
- df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
- df['preference'])
+ df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])

  return df

@@ -202,20 +183,14 @@ class BattlePairSelection:
  # Make sure model_elo_map to be ordered when compare_base is true.
  self.model_elo_map = model_elo_map

- def top_k(self,
- k: int = DEFAULT_K,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
  if k <= 0:
  k = self.DEFAULT_K
  sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
  sorted_res = list(dict(sorted_res).keys())
  return get_battle_pairs_origin(sorted_res, compare_base, swap)

- def random_k(self,
- k: int = DEFAULT_K,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
  if k <= 0:
  k = self.DEFAULT_K
  if k > len(self.model_elo_map):
@@ -226,21 +201,16 @@
  res = list(res.keys())
  return get_battle_pairs_origin(res, compare_base, swap)

- def volatility_index(self,
- frac: float = 0.2,
- compare_base: bool = False,
- swap: bool = False) -> list:
+ def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
  res_list = []
- candidate_list = get_battle_pairs_origin(
- list(self.model_elo_map.keys()), compare_base, swap)
+ candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
  for t in candidate_list:
  model_a = t[0]
  model_b = t[1]
  base_elo_a = self.model_elo_map.get(model_a)
  base_elo_b = self.model_elo_map.get(model_b)

- vol_frac = abs(base_elo_b - base_elo_a) / max(
- base_elo_a, base_elo_b)
+ vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
  if vol_frac <= frac:
  res_list.append(t)
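
As a quick aside on the Elo math shown in compute_elo above, the expected score of model A against model B follows the standard logistic form. A minimal sketch, assuming the conventional base=10 and scale=400 defaults (only init_rating=1000 is visible in the hunk):

from collections import defaultdict

def expected_score(ra: float, rb: float, base: float = 10, scale: float = 400) -> float:
    # Mirrors the expression in compute_elo: ea = 1 / (1 + base ** ((rb - ra) / scale))
    return 1 / (1 + base**((rb - ra) / scale))

rating = defaultdict(lambda: 1000)  # init_rating=1000, as in the diff
print(expected_score(rating['model_a'], rating['model_b']))  # 0.5 when both models start at 1000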
evalscope/{perf/utils → utils}/chat_service.py RENAMED
@@ -1,14 +1,13 @@
  import os
  import time
+ import torch
  from contextlib import contextmanager
  from functools import partial
- from threading import Thread
- from typing import List, Literal, Optional, Union
-
- import torch
  from modelscope import AutoModelForCausalLM, AutoTokenizer
  from pydantic import BaseModel, Field
+ from threading import Thread
  from transformers import TextIteratorStreamer
+ from typing import List, Literal, Optional, Union


  class Usage(BaseModel):
evalscope/utils/completion_parsers.py CHANGED
@@ -4,7 +4,6 @@
  import ast
  import re

-
  # from . import utils as ann_utils
  from evalscope.constants import ArenaWinner
  from evalscope.utils.logger import get_logger
@@ -51,9 +50,7 @@ def lmsys_parser(completion, output_format):
  else:
  raise Exception('Invalid score pair.')
  except Exception as e:
- logger.error(
- f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
- )
+ logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
  return ArenaWinner.UNKNOWN, [-1, -1]
  elif output_format == '[[A]]':
  if '[[A]]' in completion:
@@ -63,8 +60,7 @@ def lmsys_parser(completion, output_format):
  elif '[[C]]' in completion:
  winner = ArenaWinner.TIE
  else:
- logger.error(
- f'\nContent: {completion}\nYou must manually fix the score.')
+ logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
  winner = ArenaWinner.UNKNOWN
  return winner

@@ -76,8 +72,7 @@ def ranking_parser(completion, **kwargs):
  else:
  ordered_completions = completion

- rank = [c for c in ordered_completions
- if c['model'] == 'model_a'][0]['rank']
+ rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
  assert rank in [1, 2]

  return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
evalscope/utils/logger.py CHANGED
@@ -1,5 +1,6 @@
  import importlib.util as iutil
  import logging
+ import os
  from typing import Optional

  init_loggers = {}
@@ -9,11 +10,12 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

  detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

- logging.basicConfig(format=simple_format, level=logging.INFO)
+ logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)


- def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'):
+ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
  """Get logging logger

  Args:
@@ -29,12 +31,12 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
  logger.propagate = False

  if logger_name in init_loggers:
- if logger.level != log_level:
+ if force:
  logger.setLevel(log_level)
- add_file_handler_if_needed(logger, log_file, file_mode, log_level)
- for handler in logger.handlers:
- handler.setLevel(log_level)
- handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+ for handler in logger.handlers:
+ handler.setLevel(log_level)
+ handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+ add_file_handler_if_needed(logger, log_file, file_mode, log_level)
  return logger

  # handle duplicate logs to the console
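
A minimal usage sketch based on the hunks above (the LOG_LEVEL value and the force call are illustrative): DEFAULT_LEVEL is resolved from the LOG_LEVEL environment variable at import time, and force=True re-applies the level and formatter to a logger that was already initialized.

import logging
import os

os.environ['LOG_LEVEL'] = 'DEBUG'  # must be set before evalscope.utils.logger is first imported

from evalscope.utils.logger import get_logger

logger = get_logger()  # inherits DEFAULT_LEVEL (DEBUG here)
logger.debug('visible because LOG_LEVEL=DEBUG')

# In 0.7.2 a level mismatch alone triggered reconfiguration; now it only happens when force=True.
logger = get_logger(log_level=logging.INFO, force=True)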
evalscope/utils/model_utils.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import GenerationConfig
+
+
+ def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+ # Use the default values of temperature/top_p/top_k in generation_config.
+ if generation_config.temperature == 0:
+ generation_config.do_sample = False
+ if generation_config.do_sample is False:
+ generation_config.temperature = 1.
+ generation_config.top_p = 1.
+ generation_config.top_k = 50
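
A minimal usage sketch of the new helper (the GenerationConfig values are illustrative): a temperature of 0 disables sampling, and once sampling is off the sampling parameters are reset to their defaults so transformers does not warn about them.

from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning

gen_cfg = GenerationConfig(temperature=0.0, do_sample=True)  # hypothetical config
fix_do_sample_warning(gen_cfg)
assert gen_cfg.do_sample is False
assert gen_cfg.temperature == 1.0 and gen_cfg.top_p == 1.0 and gen_cfg.top_k == 50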