evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/run_arena.py CHANGED
@@ -3,16 +3,17 @@
 
 import argparse
 import os
-from pathlib import Path
 import torch
+from modelscope.utils.hf_util import GenerationConfig
+from pathlib import Path
 from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import get_obj_from_cfg, yaml_to_dict, jsonl_to_list, dump_jsonl_data
+from evalscope.utils import get_obj_from_cfg
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
-from modelscope.utils.hf_util import GenerationConfig
 
 logger = get_logger()
 
@@ -41,8 +42,7 @@ class ArenaWorkflow:
     def _get_obj_from_cfg(obj_cfg: dict):
         cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
         if not cls_ref:
-            logger.warning(
-                f'Class reference is not specified in config: {obj_cfg}')
+            logger.warning(f'Class reference is not specified in config: {obj_cfg}')
             return obj_cfg
 
         cls = get_obj_from_cfg(cls_ref)
@@ -50,19 +50,16 @@ class ArenaWorkflow:
 
         return obj_cfg
 
-    def _predict_answers(self,
-                         model_id_or_path: str,
-                         model_revision: str,
-                         precision: torch.dtype,
-                         generation_config: GenerationConfig,
-                         template_type: str) -> list:
+    def _predict_answers(self, model_id_or_path: str, model_revision: str, precision: torch.dtype,
+                         generation_config: GenerationConfig, template_type: str) -> list:
 
         # TODO: multi-task to be supported
-        model_adapter = ChatGenerationModelAdapter(model_id=model_id_or_path,
-                                                   model_revision=model_revision,
-                                                   torch_dtype=precision,
-                                                   generation_config=generation_config,
-                                                   template_type=template_type)
+        model_adapter = ChatGenerationModelAdapter(
+            model_id=model_id_or_path,
+            model_revision=model_revision,
+            torch_dtype=precision,
+            generation_config=generation_config,
+            template_type=template_type)
         res_list = []
         questions_list = jsonl_to_list(self.question_file)
         for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
@@ -92,8 +89,7 @@ class ArenaWorkflow:
         for model_name, cfg_d in self.answers_gen.items():
             enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
             if not enable:
-                logger.warning(
-                    f'Skip model {model_name} because it is not enabled.')
+                logger.warning(f'Skip model {model_name} because it is not enabled.')
                 continue
 
             model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
@@ -105,11 +101,12 @@ class ArenaWorkflow:
             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
 
-            answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
-                                                 model_revision=model_revision,
-                                                 precision=precision,
-                                                 generation_config=custom_generation_config,
-                                                 template_type=template_type)
+            answers_list = self._predict_answers(
+                model_id_or_path=model_id_or_path,
+                model_revision=model_revision,
+                precision=precision,
+                generation_config=custom_generation_config,
+                template_type=template_type)
 
             os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
             dump_jsonl_data(answers_list, ans_output_file)
@@ -163,8 +160,7 @@ class ArenaWorkflow:
         if enable:
             report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
             metrics = self.rating_gen.get('metrics', ['elo'])
-            baseline_model = self.rating_gen.get(
-                'baseline_model') if metrics[0] == 'pairwise' else None
+            baseline_model = self.rating_gen.get('baseline_model') if metrics[0] == 'pairwise' else None
             ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
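For reference, the reflowed call above keeps `ChatGenerationModelAdapter`'s keyword arguments exactly as before; a minimal sketch of the same construction on its own, with placeholder values for everything that is not shown in this hunk:

```python
import torch
from modelscope.utils.hf_util import GenerationConfig

from evalscope.models.model_adapter import ChatGenerationModelAdapter

# All concrete values below are placeholders, not taken from this diff.
model_adapter = ChatGenerationModelAdapter(
    model_id='ZhipuAI/LongWriter-glm4-9b',   # hypothetical ModelScope id or local dir
    model_revision='master',                 # hypothetical revision
    torch_dtype=torch.float16,
    generation_config=GenerationConfig(max_new_tokens=1024, temperature=0.5),
    template_type=None)                      # normally read from the arena YAML config
```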
evalscope/summarizer.py CHANGED
@@ -1,14 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import json
 import os
-import glob
 from typing import List, Union
 
-from evalscope.config import TaskConfig
-from evalscope.constants import OutputsStructure
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import EvalBackend
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import process_outputs_structure, yaml_to_dict, EvalBackend, json_to_dict, get_latest_folder_path, \
-    csv_to_list
+from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -20,12 +20,12 @@ class Summarizer:
     def get_report(outputs_dir: str) -> List[dict]:
         res_list: list = []
 
-        outputs_structure: dict = process_outputs_structure(outputs_dir, is_make=False)
-        reports_dir: str = outputs_structure.get(OutputsStructure.REPORTS_DIR)
+        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
+        reports_dir: str = outputs_structure.reports_dir
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')
 
-        report_files: list = glob.glob(os.path.join(reports_dir, '*.json'))
+        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
         for report_file in report_files:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))
@@ -48,39 +48,26 @@
         A report dict is overall report on a benchmark for specific model.
         """
         final_res_list: List[dict] = []
-        candidate_task_cfgs: List[dict] = []
-
-        if isinstance(task_cfg, dict):
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, str):
-            task_cfg: dict = yaml_to_dict(task_cfg)
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, TaskConfig):
-            task_cfg: dict = task_cfg.to_dict()
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, list):
+        candidate_task_cfgs: List[TaskConfig] = []
+
+        if isinstance(task_cfg, list):
             for task_cfg_item in task_cfg:
-                if isinstance(task_cfg_item, str):
-                    task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                elif isinstance(task_cfg_item, TaskConfig):
-                    task_cfg_item: dict = task_cfg_item.to_dict()
-                candidate_task_cfgs.append(task_cfg_item)
+                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
         else:
-            raise ValueError(f'Invalid task_cfg: {task_cfg}')
+            candidate_task_cfgs.append(parse_task_config(task_cfg))
 
         for candidate_task in candidate_task_cfgs:
             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE.value
+            eval_backend = candidate_task.eval_backend
 
-            if eval_backend == EvalBackend.NATIVE.value:
-                outputs_dir: str = candidate_task.get('outputs')
-                outputs_dir: str = os.path.expanduser(outputs_dir)
+            if eval_backend == EvalBackend.NATIVE:
+                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                 if outputs_dir is None:
                     raise ValueError(f'No outputs_dir in {task_cfg}')
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                 final_res_list.extend(res_list)
 
-            elif eval_backend == EvalBackend.OPEN_COMPASS.value:
+            elif eval_backend == EvalBackend.OPEN_COMPASS:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs/default'
@@ -93,25 +80,25 @@
                     raise ValueError(f'No summary files in {res_folder_path}')
 
                 summary_file_path = summary_files[0]
-                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
                 summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                 final_res_list.extend(summary_res)
-            elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
+            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs'
                 if not os.path.exists(work_dir):
                     raise ValueError(f'work_dir {work_dir} does not exist.')
-
+
                 for model in eval_config['model']:
                     if model['name'] == 'CustomAPIModel':
                         model_name = model['type']
                     else:
                         model_name = model['name']
-
+
                     csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                     json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
-
+
                     summary_files = csv_files + json_files
                     for summary_file_path in summary_files:
                         if summary_file_path.endswith('csv'):
@@ -120,17 +107,17 @@
                             summary_res: dict = json_to_dict(summary_file_path)
                         file_name = os.path.basename(summary_file_path).split('.')[0]
                         final_res_list.append({file_name: summary_res})
-
-            elif eval_backend == EvalBackend.THIRD_PARTY.value:
-                raise ValueError(f'*** The summarizer for Third party evaluation backend is not supported yet ***')
+
+            elif eval_backend == EvalBackend.THIRD_PARTY:
+                raise ValueError('*** The summarizer for Third party evaluation backend is not supported yet ***')
             else:
                 raise ValueError(f'Invalid eval_backend: {eval_backend}')
 
         return final_res_list
 
     @staticmethod
-    def parse_eval_config(candidate_task):
-        eval_config: Union[str, dict] = candidate_task.get('eval_config')
+    def parse_eval_config(candidate_task: TaskConfig):
+        eval_config: Union[str, dict] = candidate_task.eval_config
         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
 
         if isinstance(eval_config, str):
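The reworked `Summarizer.get_report` resolves the reports directory through `OutputsStructure` and now globs report JSON files one level below `reports/`. A minimal usage sketch, assuming an output directory produced by a native evalscope run:

```python
import json

from evalscope.summarizer import Summarizer

# './outputs' is an assumed path; use the work_dir of an actual native-eval run.
reports = Summarizer.get_report(outputs_dir='./outputs')
for report in reports:
    # Each entry is one benchmark report loaded from reports/**/*.json
    print(json.dumps(report, ensure_ascii=False)[:200])
```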
evalscope/third_party/longbench_write/README.md CHANGED
@@ -3,14 +3,18 @@
 The LongWriter supports 10,000+ Word Generation From Long Context LLMs.
 We can use the benchmark LongBench-Write focuses more on measuring the long output quality as well as the output length.
 
-Refer to https://github.com/THUDM/LongWriter
+GitHub: [LongWriter](https://github.com/THUDM/LongWriter)
+
+Technical Report: [Minimum Tuning to Unlock Long Output from LLMs with High Quality Data as the Key](https://arxiv.org/abs/2410.10210)
+
 
 ## Usage
 
 ### Installation
 
 ```bash
-pip install evalscope[framework]
+pip install evalscope[framework] -U
+pip install vllm -U
 ```
 
 ### Task configuration
@@ -24,53 +28,79 @@ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
                 model='ZhipuAI/LongWriter-glm4-9b',
                 input_data_path=None,
                 output_dir='./outputs',
-                openai_api_key=None,
-                openai_gpt_model='gpt-4o-2024-05-13',
-                infer_generation_kwargs={
-                    'max_new_tokens': 32768,
-                    'temperature': 0.5
-                },
-                eval_generation_kwargs={
-                    'max_new_tokens': 1024,
-                    'temperature': 0.5,
-                    'stop': None
+                infer_config={
+                    'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                    'is_chat': True,
+                    'verbose': False,
+                    'generation_kwargs': {
+                        'max_new_tokens': 32768,
+                        'temperature': 0.5,
+                        'repetition_penalty': 1.0
+                    },
+                    'proc_num': 16,
                 },
-                proc_num=8)
+                eval_config={
+                    # No need to set OpenAI info if skipping the stage `eval_q`
+                    'openai_api_key': None,
+                    'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                    'openai_gpt_model': 'gpt-4o-2024-05-13',
+                    'generation_kwargs': {
+                        'max_new_tokens': 1024,
+                        'temperature': 0.5,
+                        'stop': None
+                    },
+                    'proc_num': 8
+                }
+                )
 
 ```
 - Arguments:
-  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process.
-  - `model`: model id on the ModelScope hub, or local model dir.
+  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process with the model-as-judge.
+  - `model`: model id on the ModelScope hub, or local model dir. Refer to [LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary) for more details.
  - `input_data_path`: input data path, default to `None`, it means to use [longbench_write](resources/longbench_write.jsonl)
  - `output_dir`: output root directory.
  - `openai_api_key`: openai_api_key when enabling the stage `eval_q` to use `Model-as-Judge`. Default to None if not needed.
  - `openai_gpt_model`: Judge model name from OpenAI. Default to `gpt-4o-2024-05-13`
-  - `infer_generation_kwargs`: The generation kwargs for models to be evaluated.
-  - `eval_generation_kwargs`: The generation kwargs for judge-models.
-  - `proc_num`: proc num.
+  - `generation_kwargs`: The generation configs.
+  - `proc_num`: process number for inference and evaluation.
 
 
 2. Configuration with json (Optional):
 
 ```json
 {
-  "stage": ["infer", "eval_l", "eval_q"],
+  "stage": [
+    "infer",
+    "eval_l",
+    "eval_q"
+  ],
   "model": "ZhipuAI/LongWriter-glm4-9b",
   "input_data_path": null,
  "output_dir": "./outputs",
-  "openai_api_key": null,
-  "openai_gpt_model": "gpt-4o-2024-05-13",
-  "infer_generation_kwargs": {
-    "max_new_tokens": 32768,
-    "temperature": 0.5
+  "infer_config": {
+    "openai_api_base": "http://127.0.0.1:8000/v1/chat/completions",
+    "is_chat": true,
+    "verbose": false,
+    "generation_kwargs": {
+      "max_new_tokens": 32768,
+      "temperature": 0.5,
+      "repetition_penalty": 1.0
+    },
+    "proc_num": 16
   },
-  "eval_generation_kwargs": {
-    "max_new_tokens": 1024,
-    "temperature": 0.5,
-    "stop": null
-  },
-  "proc_num": 8
+  "eval_config": {
+    "openai_api_key": null,
+    "openai_api_base": "https://api.openai.com/v1/chat/completions",
+    "openai_gpt_model": "gpt-4o-2024-05-13",
+    "generation_kwargs": {
+      "max_new_tokens": 1024,
+      "temperature": 0.5,
+      "stop": null
+    },
+    "proc_num": 8
+  }
 }
+
 ```
 Refer to [default_task.json](default_task.json) for more details.
 
@@ -82,24 +112,51 @@ stage:
   - infer
   - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir: ./outputs
-openai_api_key: null
-openai_gpt_model: gpt-4o-2024-05-13
-infer_generation_kwargs:
-  max_new_tokens: 32768
-  temperature: 0.5
-eval_generation_kwargs:
-  max_new_tokens: 1024
-  temperature: 0.5
-  stop: null
-proc_num: 8
+output_dir: "./outputs"
+infer_config:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
+  is_chat: true
+  verbose: false
+  generation_kwargs:
+    max_new_tokens: 32768
+    temperature: 0.5
+    repetition_penalty: 1.0
+  proc_num: 16
+eval_config:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
+  generation_kwargs:
+    max_new_tokens: 1024
+    temperature: 0.5
+    stop: null
+  proc_num: 8
 
 ```
 Refer to [default_task.yaml](default_task.yaml) for more details.
 
 
+### Run Model Inference
+We recommend to use the [vLLM](https://github.com/vllm-project/vllm) to deploy the model.
+
+Environment:
+* A100(80G) x 1
+
+
+To start vLLM server, run the following command:
+```shell
+CUDA_VISIBLE_DEVICES=0 VLLM_USE_MODELSCOPE=True vllm serve --max-model-len=65536 --gpu_memory_utilization=0.95 --trust-remote-code ZhipuAI/LongWriter-glm4-9b
+```
+- Arguments:
+  - `max-model-len`: The maximum length of the model input.
+  - `gpu_memory_utilization`: The GPU memory utilization.
+  - `trust-remote-code`: Whether to trust the remote code.
+  - `model`: Could be a model id on the ModelScope/HuggingFace hub, or a local model dir.
+
+* Note: You can use multiple GPUs by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` alternatively.
+
 
 ### Run the task
 
evalscope/third_party/longbench_write/default_task.json CHANGED
@@ -14,7 +14,7 @@
         }
     },
     "eval_config": {
-        "openai_api_key": "YOUR_OPENAI_API_KEY",
+        "openai_api_key": null,
         "openai_api_base": "https://api.openai.com/v1/chat/completions",
         "openai_gpt_model": "gpt-4o-2024-05-13",
         "generation_kwargs": {
evalscope/third_party/longbench_write/default_task.yaml CHANGED
@@ -2,23 +2,24 @@ stage:
   - infer
   - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir: './outputs'
+output_dir: "./outputs"
 infer_config:
-  openai_api_base: 'http://127.0.0.1:8000/v1/chat/completions'
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
   is_chat: true
   verbose: false
   generation_kwargs:
     max_new_tokens: 32768
     temperature: 0.5
     repetition_penalty: 1.0
+  proc_num: 16
 eval_config:
-  openai_api_key: 'YOUR_OPENAI_API_KEY'
-  openai_api_base: 'https://api.openai.com/v1/chat/completions'
-  openai_gpt_model: 'gpt-4o-2024-05-13'
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
   generation_kwargs:
     max_new_tokens: 1024
     temperature: 0.5
     stop: null
-  proc_num: 16
+  proc_num: 8
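The YAML task file carries the same `infer_config` as the README and `default_task.json`, including the OpenAI-compatible endpoint served by vLLM. A short sketch that loads the YAML with the `yaml_to_dict` helper (now in `evalscope.utils.io_utils`) and sends one request to that endpoint to confirm the server is reachable before the `infer` stage; the file path is an assumption, and the request mirrors the shape used in `eval.py` below:

```python
import requests

from evalscope.utils.io_utils import yaml_to_dict

# Assumed location; point it at your copy of default_task.yaml.
task_cfg = yaml_to_dict('evalscope/third_party/longbench_write/default_task.yaml')

url = task_cfg['infer_config']['openai_api_base']   # http://127.0.0.1:8000/v1/chat/completions
payload = {
    'model': task_cfg['model'],                      # ZhipuAI/LongWriter-glm4-9b
    'messages': [{'role': 'user', 'content': 'Write a 100-word note on long-form writing.'}],
    'max_tokens': 128,
    'temperature': 0.5,
}
resp = requests.post(url, json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()['choices'][0]['message']['content'])
```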
evalscope/third_party/longbench_write/eval.py CHANGED
@@ -1,19 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) ZhipuAI, Inc. and its affiliates.
-import multiprocessing
-import os
 import json
-import random
-import re
-from concurrent.futures import ThreadPoolExecutor
-
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+import random
+import re
 import requests
+from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-from evalscope.utils import jsonl_to_list
 from evalscope.utils import get_logger
+from evalscope.utils.io_utils import jsonl_to_list
 
 logger = get_logger()
 
@@ -52,14 +50,16 @@ class EvalLength:
         return 100 * max(0, 1. - (x / y - 1) / 2)
 
     def eval(self, dump_res: bool = True):
-        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx", "type": "Community Forum", "length": 100, "response_length": 103, "response": "I. Introduction A. xxx"}
+        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        #            "type": "Community Forum", "length": 100, "response_length": 103,
+        #            "response": "I. Introduction A. xxx"}
         predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
         x, y, scores = [], [], []
 
-        for pred in tqdm(predictions, total=len(predictions), desc=f'Process of eval_l: '):
-            x.append(pred["length"])
-            y.append(pred["response_length"])
-            scores.append(self.score(pred["length"], pred["response_length"]))
+        for pred in tqdm(predictions, total=len(predictions), desc='[Processing eval_l]'):
+            x.append(pred['length'])
+            y.append(pred['response_length'])
+            scores.append(self.score(pred['length'], pred['response_length']))
 
         avg_score_l = np.mean(scores)
         logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
@@ -105,7 +105,7 @@ class EvalQuality:
 
     EVAL_Q = 'eval_quality'
     OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
-    DIMS = ["Relevance", "Accuracy", "Coherence", "Clarity", "Breadth and Depth", "Reading Experience"]
+    DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']
 
     def __init__(self,
                  model: str,
@@ -144,7 +144,8 @@ class EvalQuality:
 
         self.openai_api_key: str = openai_api_key
         self.openai_gpt_model = openai_gpt_model
-        assert self.openai_api_key, 'Please set `OPENAI_API_KEY` in environment variables.'
+        if not self.openai_api_key:
+            logger.error('Please set `OPENAI_API_KEY` in the envs when stage `eval_q` is activated!')
 
     def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
         tries = 0
@@ -152,17 +153,17 @@ class EvalQuality:
            tries += 1
            try:
                headers = {
-                    'Authorization': "Bearer {}".format(self.openai_api_key),
+                    'Authorization': 'Bearer {}'.format(self.openai_api_key),
                }
                messages = [
                    {'role': 'user', 'content': prompt},
                ]
                resp = requests.post(self.openai_api_base, json={
-                    "model": self.openai_gpt_model,
-                    "messages": messages,
-                    "temperature": temperature,
-                    "max_tokens": max_new_tokens,
-                    "stop": stop,
+                    'model': self.openai_gpt_model,
+                    'messages': messages,
+                    'temperature': temperature,
+                    'max_tokens': max_new_tokens,
+                    'stop': stop,
                }, headers=headers, timeout=600)
                if resp.status_code != 200:
                    raise Exception(resp.text)
@@ -172,16 +173,16 @@ class EvalQuality:
            except KeyboardInterrupt as e:
                raise e
            except Exception as e:
-                if "maximum context length" in str(e):
+                if 'maximum context length' in str(e):
                    raise e
-                elif "triggering" in str(e):
+                elif 'triggering' in str(e):
                    return 'Trigger OpenAI\'s content management policy'
                logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
        else:
-            logger.error("Max tries. Failed.")
-            return "Max tries. Failed."
+            logger.error('Max tries. Failed.')
+            return 'Max tries. Failed.'
        try:
-            return resp["choices"][0]["message"]["content"]
+            return resp['choices'][0]['message']['content']
        except:
            return ''
 
@@ -195,7 +196,7 @@ class EvalQuality:
 
    def process_data(self, item):
        # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
-        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item["response"])
+        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item['response'])
        scores = None
        output = self.get_response_gpt4(prompt, **self.generation_kwargs)
        try:
@@ -235,7 +236,8 @@ class EvalQuality:
        total_score = dict()
        for dim in self.DIMS:
            # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
-            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores'] else 3 for item in self.eval_scores]
+            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+                      else 3 for item in self.eval_scores]
            total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
        total_score['total'] = sum(total_score.values()) / len(total_score)
        logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
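The final hunk shows how `eval_q` aggregates the judge's ratings: each of the six dimensions is rated 1 to 5 (missing dimensions fall back to 3), the per-dimension mean is rescaled to 0 to 100 via `(mean - 1) * 25`, and the total is the average across dimensions. A small worked example of that arithmetic with made-up ratings:

```python
# Made-up judge ratings for two samples; DIMS matches the constant defined in EvalQuality.
DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']
eval_scores = [
    {'scores': {'Relevance': 4, 'Accuracy': 5, 'Coherence': 4, 'Clarity': 4,
                'Breadth and Depth': 3, 'Reading Experience': 4}},
    {'scores': {'Relevance': 5, 'Accuracy': 4, 'Coherence': 4, 'Clarity': 5}},  # two dims missing -> default 3
]

total_score = {}
for dim in DIMS:
    scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores'] else 3
              for item in eval_scores]
    total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25  # maps the 1-5 scale onto 0-100
total_score['total'] = sum(total_score.values()) / len(total_score)
print(round(total_score['total'], 2))  # 75.0 for these ratings
```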