evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,28 +1,26 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import json
 import os
 import time
-import json
-import re
-from copy import deepcopy
 from collections import OrderedDict
-
+from copy import deepcopy
 from tqdm import tqdm
-from typing import Optional, List, Any, Union, Dict
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
-from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, OutputsStructure, AnswerKeys, ReviewKeys, EvalStage
+from evalscope.config import TaskConfig
+from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
+                                 OutputsStructure, ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import gen_hash, dict_torch_dtype_to_str, dump_jsonl_data, process_outputs_structure, \
-    normalize_score, dict_to_yaml, jsonl_to_list
+from evalscope.utils import dict_torch_dtype_to_str, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
 class Evaluator(object):
-
     """
     The evaluator for model on datasets.
 
@@ -33,11 +31,8 @@ class Evaluator(object):
         data_adapter: DataAdapter, the data adapter for the dataset.
         subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-        use_cache: bool, whether to use local cache. Default: True
-        mem_cache_method: str, the memory cache method. Default: 'ttl' (deprecated)
-        root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
-        outputs_dir: str, the outputs dir. Default: ''
-        is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
+        use_cache: str, path to local cache. Default: None
+        outputs_dir: OutputsStructure, the outputs dir. Default: None
         datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
         datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
         stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
@@ -51,24 +46,20 @@ class Evaluator(object):
                 data_adapter: DataAdapter,
                 subset_list: Optional[list] = None,
                 model_adapter: Optional[BaseModelAdapter] = None,
-                use_cache: bool = True,
-                mem_cache_method: str = 'ttl',
-                root_cache_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
-                outputs_dir: Optional[str] = '',
-                is_custom_outputs_dir: bool = False,
-                datasets_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
-                datasets_hub: Optional[str] = 'ModelScope',
-                stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
-                eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
-                overall_task_cfg: Optional[dict] = None,
+                use_cache: Optional[str] = None,
+                outputs: Optional[OutputsStructure] = None,
+                datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                datasets_hub: Optional[str] = HubType.MODELSCOPE,
+                stage: Optional[str] = EvalStage.ALL,
+                eval_type: Optional[str] = EvalType.CHECKPOINT,
+                overall_task_cfg: Optional[TaskConfig] = None,
                 **kwargs):
 
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.custom_task_name: str = None
-        if os.path.exists(self.dataset_name_or_path):
-            self.custom_task_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
+        self.model_name = os.path.basename(str(overall_task_cfg.model).rstrip(os.sep))
+        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
-        self.root_cache_dir = os.path.expanduser(root_cache_dir)
         self.datasets_dir = os.path.expanduser(datasets_dir)
         self.kwargs = kwargs
         self.data_adapter = data_adapter
@@ -78,70 +69,31 @@ class Evaluator(object):
         self.use_cache = use_cache
         self.overall_task_cfg = overall_task_cfg
         if isinstance(self.model_adapter, CustomModelAdapter):
-            self.overall_task_cfg.update({'custom_config': self.model_adapter.custom_model.config})
+            self.overall_task_cfg.model_args = self.model_adapter.custom_model.config
 
         self.model_cfg = self.model_adapter.model_cfg
-        self.model_id = self.model_cfg['model_id']
-        self.model_revision = self.model_cfg.get('revision', None)
-        self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'
-
-        # Get default outputs_dir
-        # TODO: refactor outputs_dir, del timestamp concat
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=self.model_id,
-        #                                    model_revision=self.model_revision_str)
-
-        self.outputs_dir = os.path.expanduser(outputs_dir)
 
         # Deal with the output paths
-        self.outputs_structure = process_outputs_structure(self.outputs_dir)
+        self.outputs_structure = outputs
 
         # Load dataset
-        self.dataset = self.data_adapter.load(dataset_name_or_path=dataset_name_or_path,
-                                              subset_list=subset_list,
-                                              work_dir=self.datasets_dir,
-                                              datasets_hub=datasets_hub,
-                                              **kwargs)
+        self.dataset = self.data_adapter.load(
+            dataset_name_or_path=dataset_name_or_path,
+            subset_list=subset_list,
+            work_dir=self.datasets_dir,
+            datasets_hub=datasets_hub,
+            **kwargs)
 
         # Get prompts from dataset
         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
         del self.dataset
 
-        # Init memory cache
-        # TODO: refactor mem cache manager
-        # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
-        #     '_' + self.model_id.replace('/', '_') + \
-        #     '_' + self.model_revision_str + \
-        #     '_cache.pkl'
-        # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)
-
-        # Note: mem_cache is deprecated, use `use_cache` instead
-        self.mem_cache = None
-        self.mem_cache_method = mem_cache_method
-        # if self.use_cache:
-        #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
-        #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')
-
-    def _pred_answer(self,
-                     input_d: dict,
-                     infer_cfg: dict,
-                     subset_name: str,
-                     answer_id: str = None) -> dict:
-
-        # Get answer from memory cache
-        if self.mem_cache is not None:
-            if answer_id in self.mem_cache:
-                logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
-                return self.mem_cache[answer_id]
+    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
 
         ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
         ans[AnswerKeys.ANSWER_ID] = answer_id
         ans[AnswerKeys.SUBSET_NAME] = subset_name
 
-        if self.mem_cache is not None:
-            self.mem_cache[answer_id] = ans
-
         return ans
 
     def get_answers(self,
@@ -177,26 +129,21 @@ class Evaluator(object):
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
-        pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
-
-        if self.custom_task_name:
-            pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-        else:
-            pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-
-        pred_file_path: str = os.path.join(pred_dir, pred_file_name)
+        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
         if self.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
-            logger.info(f'** Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
-
-            return answers_list
+            logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
+            # Note: assume prediction in order of prompts_list
+            prompts_list = prompts_list[len(answers_list):]
 
         if isinstance(self.model_adapter, CustomModelAdapter):
             # Batch inference for custom model
 
-            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(inputs=prompts_list,
-                                                                                 infer_cfg=infer_cfg)
+            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
+                inputs=prompts_list, infer_cfg=infer_cfg)
 
             assert len(prompts_list) == len(resp_answers_list), \
                 f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
@@ -207,10 +154,10 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())),
-                                              ensure_ascii=False)
-                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
-                                           ensure_ascii=False)
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
 
                 resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
@@ -220,6 +167,7 @@ class Evaluator(object):
                 resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
 
                 answers_list.append(resp_d)
+                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
 
         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
@@ -228,17 +176,15 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())),
-                                              ensure_ascii=False)
-                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
-                                           ensure_ascii=False)
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
 
                 # Get answers
-                answer_d: dict = self._pred_answer(input_d=input_prompt,
-                                                   infer_cfg=infer_cfg,
-                                                   subset_name=subset_name,
-                                                   answer_id=answer_id)
+                answer_d: dict = self._pred_answer(
+                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
 
                 answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                 answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
@@ -249,26 +195,12 @@ class Evaluator(object):
                     logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
 
                 answers_list.append(answer_d)
+                dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
 
-        if len(answers_list) == 0:
-            logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')
-
-        # Dump answers
-        os.makedirs(pred_dir, exist_ok=True)
-        dump_jsonl_data(answers_list, pred_file_path)
-
+        logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
 
-    def _get_review(self,
-                    answer_d: dict,
-                    review_id: str = None,
-                    reviewer_spec: dict = None) -> dict:
-
-        # Get review from memory cache
-        if self.mem_cache is not None:
-            if review_id in self.mem_cache:
-                logger.info(f'** Reusing review `{review_id}` in memory cache.')
-                return self.mem_cache[review_id]
+    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
 
         if reviewer_spec is None:
             reviewer_spec = {}
@@ -286,15 +218,16 @@ class Evaluator(object):
         for choice in choices:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-            answer_content = self.data_adapter.parse_pred_result(result=answer_content,
-                                                                 raw_input_d=raw_input_d,
-                                                                 eval_type=self.eval_type)
+            answer_content = self.data_adapter.parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)
 
             review_result = self.data_adapter.match(gold_content, answer_content)
-            choice[ReviewKeys.REVIEW] = {ReviewKeys.GOLD: gold_content,
-                                         ReviewKeys.PRED: answer_content,
-                                         ReviewKeys.RESULT: review_result}
+            choice[ReviewKeys.REVIEW] = {
+                ReviewKeys.GOLD: gold_content,
+                ReviewKeys.PRED: answer_content,
+                ReviewKeys.RESULT: review_result
+            }
 
             rev_choices.append(choice)
 
@@ -304,9 +237,6 @@ class Evaluator(object):
         review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
         review_res[ReviewKeys.REVIEW_TIME] = time.time()
 
-        if self.mem_cache is not None:
-            self.mem_cache[review_id] = review_res
-
        return review_res
 
     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
@@ -324,26 +254,25 @@ class Evaluator(object):
        """
        reviews_list = []
 
-        review_dir: str = self.outputs_structure.get(OutputsStructure.REVIEWS_DIR)
-        if self.custom_task_name:
-            review_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-        else:
-            review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-        review_file_path: str = os.path.join(review_dir, review_file_name)
+        review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
+        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
 
         if self.use_cache and os.path.exists(review_file_path):
-            logger.warning(f'** Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+            logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
 
             # Gen review_id (concat: answer_id + reviewer_spec)
             answer_id = answer_d[AnswerKeys.ANSWER_ID]
 
-            reviewer_spec: dict = {'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
                                   'reviewer': ['Evaluator'],
-                                   'revision': ['default']}
-            reviewer_spec_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())),
-                                           ensure_ascii=False)
+            reviewer_spec: dict = {
+                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+                'reviewer': ['Evaluator'],
+                'revision': ['default']
+            }
+            reviewer_spec_str = json.dumps(
+                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
             review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
 
             # Get review
@@ -354,9 +283,8 @@ class Evaluator(object):
 
             reviews_list.append(review_d)
 
-        # Dump reviews
-        os.makedirs(review_dir, exist_ok=True)
-        dump_jsonl_data(reviews_list, review_file_path)
+            # Dump reviews
+            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
@@ -375,7 +303,7 @@ class Evaluator(object):
         review_res_list = []
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'** Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
             review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -385,7 +313,7 @@ class Evaluator(object):
 
         return metric_score
 
-    def dump_report(self, report_map: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
        """
        Get report for total reviews of specific dataset.
        It is required to rewrite this method to support your own evaluator.
@@ -396,50 +324,31 @@ class Evaluator(object):
 
        Returns: None
        """
+        # Get report map
+        report_map: dict = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
+        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))
 
        # Dump report
-        report_dir: str = self.outputs_structure[OutputsStructure.REPORTS_DIR]
-
-        if self.custom_task_name:
-            report_file_name: str = self.custom_task_name + '.json'
-        else:
-            report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'
+        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
+                                        self.dataset_name + '.json')
+        os.makedirs(os.path.dirname(report_path), exist_ok=True)
 
-        os.makedirs(report_dir, exist_ok=True)
-        report_path: str = os.path.join(report_dir, report_file_name)
+        # Write report
        with open(report_path, 'w') as f:
            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_path} \n')
-        logger.info(f'** Dump report: {report_file_name} \n')
+        logger.info(f'Dump report: {report_path} \n')
 
+        # Make table
        if use_table:
            try:
-                # Make table
-                report_table: str = gen_table([report_dir])
-                logger.info(f'** Report table: \n {report_table} \n')
-            except:
+                report_table: str = gen_table([self.outputs_structure.reports_dir])
+                logger.info(f'Report table: \n{report_table} \n')
+            except Exception:
                logger.error('Failed to generate report table.')
+        return report_map
 
-    # def save_cache(self):
-    #     if self.mem_cache is not None:
-    #         logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
-    #         Cache.save(cache=self.mem_cache, path=self.mem_cache_path)
-
-    # def clear_cache(self):
-    #     """
-    #     Clear memory cache.
-    #
-    #     Returns: None
-    #     """
-    #     if self.mem_cache is not None:
-    #         cache_len = len(self.mem_cache)
-    #         self.mem_cache.clear()
-    #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')
-
-    def eval(self,
-             infer_cfg: dict = None,
-             debug: bool = False,
-             **kwargs) -> dict:
+    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
        """
        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
        It is required to rewrite this method to support your own evaluator.
@@ -465,27 +374,22 @@ class Evaluator(object):
 
        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
 
-        reviews_score_all = {} # {subset_name: (score, num)}
+        reviews_score_all = {}  # {subset_name: (score, num)}
        stage_answers_dict = {}
        stage_reviews_dict = {}
 
        for subset_name, prompts_list in self.prompts.items():
-            limit = infer_cfg.get('limit', len(prompts_list))
+            limit = kwargs.get('limit', len(prompts_list))
            prompts_list = prompts_list[:limit]
 
-            answers_list: list = self.get_answers(subset_name=subset_name,
-                                                  prompts_list=prompts_list,
-                                                  infer_cfg=infer_cfg,
-                                                  debug=debug,
-                                                  **kwargs)
+            answers_list: list = self.get_answers(
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
            if self.stage == EvalStage.INFER:
                stage_answers_dict[subset_name] = answers_list
                continue
 
-            reviews_list: list = self.get_reviews(subset_name=subset_name,
-                                                  answers_list=answers_list,
-                                                  debug=debug,
-                                                  **kwargs)
+            reviews_list: list = self.get_reviews(
+                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
            metric_res = self.compute_metrics(reviews_list=reviews_list)
            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
@@ -498,193 +402,8 @@ class Evaluator(object):
            return stage_reviews_dict
 
        # Generate report
-        report_map: dict = self.data_adapter.gen_report(subset_score_map=reviews_score_all,
-                                                        report_name=self.custom_task_name)
-        self.dump_report(report_map=report_map)
-
-        # Dump overall task config
-        overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
-                                                  'task_output_config.yaml')
-        overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)
-
-        # TODO: check the robustness of dump yaml
-        try:
-            logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
-            logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
-            if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
-                self.overall_task_cfg['model'] = None
-                logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
-            if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
-                self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
-                logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')
-
-            dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
-        except Exception as e:
-            logger.warning(f'Failed to dump overall task config: {e}')
-
-        # Note: deprecated
-        # self.save_cache()
-        # self.clear_cache()
-
-        logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')
-
-        return report_map
-
-
-class HumanevalEvaluator(object):
+        report_map = self.dump_report(reviews_score_all)
 
-    def __init__(self,
-                 problem_file: str,
-                 model_id: str,
-                 model_revision: str,
-                 model_adapter: BaseModelAdapter,
-                 outputs_dir: Optional[str] = '',
-                 is_custom_outputs_dir: bool = False,
-                 k: List[int] = [1, 10, 100],
-                 n_workers: int = 4,
-                 timeout: float = 3.0,):
-        try:
-            from human_eval.evaluation import evaluate_functional_correctness
-            from human_eval.data import read_problems, write_jsonl
-        except ImportError:
-            raise ImportError('Please install human_eval:'
-                              'https://github.com/openai/human-eval/tree/master#installation , '
-                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-        self.problem_file = problem_file
-        self.k = k
-        self.num_workers = n_workers
-        self.timeout = timeout
-        self.model_adapter = model_adapter
-
-        self.read_problems_func = read_problems
-        self.write_jsonl_func = write_jsonl
-        self.eval_func = evaluate_functional_correctness
-
-        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-        self.problems = self.read_problems_func(self.problem_file)
-
-        # Get default outputs_dir
-        model_revision_str: str = model_revision if model_revision is not None else 'none'
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=model_id,
-        #                                    model_revision=model_revision_str)
-        self.outputs_dir = os.path.expanduser(outputs_dir)
-
-        # Deal with the output paths
-        self.outputs_structure = process_outputs_structure(self.outputs_dir)
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = 'Complete the following python code:\n'
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-            # pred_res: dict = self.model_adapter.predict(inputs)
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
+        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
 
-        # predict
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
-                                         'human_eval_predictions.jsonl')
-
-        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(sample_file=ans_out_file,
-                                 k=self.k,
-                                 n_workers=self.num_workers,
-                                 timeout=self.timeout,
-                                 problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info(f'** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, results: dict) -> dict:
-        """
-        Generate report from evaluation results.
-
-        Returns:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        results = {k: normalize_score(score=v) for k, v in results.items()}
-
-        category_d = dict(name='DEFAULT',
-                          score=results,
-                          subset=[])
-
-        res_map = dict(name='HumanEval',
-                       metric='pass@k',
-                       score=results,
-                       category=[category_d],
-                       total_num=len(self.problems))
-
-        return res_map
-
-    @classmethod
-    def _postprocess(cls, text: str) -> str:
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
-        return text
+        return report_map
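
Note on the caching change above: 0.8.0 drops the deprecated in-memory mem_cache and instead makes the predictions file itself the cache. get_answers() now reloads any existing predictions JSONL, skips the prompts that were already answered, and appends each new answer as it is produced via dump_jsonl_data(..., dump_mode=DumpMode.APPEND). A minimal standalone sketch of that resume-and-append pattern is shown below; the helper names (run_with_cache, append_jsonl, fake_predict) are illustrative only and are not part of the evalscope API.

import json
import os
from typing import Callable, Dict, List


def jsonl_to_list(path: str) -> List[Dict]:
    # Read one JSON object per line, skipping blank lines.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def append_jsonl(record: Dict, path: str) -> None:
    # Append a single record per call, mirroring the DumpMode.APPEND behaviour above.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')


def run_with_cache(prompts: List[Dict], pred_file_path: str, predict: Callable[[Dict], Dict]) -> List[Dict]:
    # Resume from an existing predictions file, then append each new answer as it is produced,
    # so an interrupted run loses at most the answer that was in flight.
    os.makedirs(os.path.dirname(pred_file_path) or '.', exist_ok=True)
    answers: List[Dict] = []
    if os.path.exists(pred_file_path):
        answers = jsonl_to_list(pred_file_path)
        # Assumes cached answers are stored in the same order as `prompts`, as the diff notes.
        prompts = prompts[len(answers):]
    for prompt in prompts:
        answer = predict(prompt)
        answers.append(answer)
        append_jsonl(answer, pred_file_path)
    return answers


if __name__ == '__main__':
    def fake_predict(prompt: Dict) -> Dict:
        # Stand-in for a model adapter's predict() call.
        return {'prompt': prompt['text'], 'answer': prompt['text'].upper()}

    demo_prompts = [{'text': 'hello'}, {'text': 'world'}]
    print(run_with_cache(demo_prompts, 'outputs/demo_predictions.jsonl', fake_predict))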