evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
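The largest single change in this release is the rewrite of evalscope/evaluator/evaluator.py (+97 -377), reproduced below. The deprecated in-memory TTL cache is dropped: use_cache now takes a cache path instead of a bool, and per-subset predictions and reviews are appended to JSONL files as they are produced, then reloaded on the next run so an interrupted evaluation can resume where it stopped. The sketch below illustrates that resume pattern in isolation; it is a minimal stand-in, not the evalscope API, the helper name run_with_resume is hypothetical, and it assumes (as the diff does) that cached answers are stored one JSON object per line in prompt order.

```python
import json
import os
from typing import Callable, Dict, List


def run_with_resume(prompts: List[Dict],
                    predict: Callable[[Dict], Dict],
                    pred_file_path: str) -> List[Dict]:
    """Minimal stand-in for the resume behaviour shown in Evaluator.get_answers():
    previously dumped answers are reloaded, already-answered prompts are skipped,
    and every new answer is appended to the JSONL file as soon as it is produced."""
    os.makedirs(os.path.dirname(pred_file_path) or '.', exist_ok=True)

    answers: List[Dict] = []
    if os.path.exists(pred_file_path):
        with open(pred_file_path, 'r', encoding='utf-8') as f:
            answers = [json.loads(line) for line in f if line.strip()]
        # As in the diff, cached answers are assumed to be in prompt order.
        prompts = prompts[len(answers):]

    with open(pred_file_path, 'a', encoding='utf-8') as f:
        for prompt in prompts:
            answer = predict(prompt)
            answers.append(answer)
            f.write(json.dumps(answer, ensure_ascii=False) + '\n')

    return answers
```

An interrupted run pointed at the same predictions file therefore only sends the remaining prompts to the model, which mirrors what get_answers() in the diff does with jsonl_to_list and dump_jsonl_data(..., dump_mode=DumpMode.APPEND).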
@@ -1,28 +1,27 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import json
  import os
  import time
- import json
- import re
- from copy import deepcopy
  from collections import OrderedDict
-
+ from copy import deepcopy
  from tqdm import tqdm
- from typing import Optional, List, Any, Union, Dict
+ from typing import Any, Dict, List, Optional, Union

  from evalscope.benchmarks import DataAdapter
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, OutputsStructure, AnswerKeys, ReviewKeys, EvalStage
+ from evalscope.config import TaskConfig
+ from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
+                                  ReviewKeys)
  from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
  from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import gen_hash, dict_torch_dtype_to_str, dump_jsonl_data, process_outputs_structure, \
-     normalize_score, dict_to_yaml, jsonl_to_list
+ from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()


  class Evaluator(object):
-
      """
      The evaluator for model on datasets.

@@ -33,11 +32,8 @@ class Evaluator(object):
          data_adapter: DataAdapter, the data adapter for the dataset.
          subset_list: list, the subset list for the dataset.
          model_adapter: BaseModelAdapter, the model adapter for the model.
-         use_cache: bool, whether to use local cache. Default: True
-         mem_cache_method: str, the memory cache method. Default: 'ttl' (deprecated)
-         root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
-         outputs_dir: str, the outputs dir. Default: ''
-         is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
+         use_cache: str, path to local cache. Default: None
+         outputs_dir: OutputsStructure, the outputs dir. Default: None
          datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
          datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
          stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
@@ -51,24 +47,20 @@ class Evaluator(object):
                   data_adapter: DataAdapter,
                   subset_list: Optional[list] = None,
                   model_adapter: Optional[BaseModelAdapter] = None,
-                  use_cache: bool = True,
-                  mem_cache_method: str = 'ttl',
-                  root_cache_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
-                  outputs_dir: Optional[str] = '',
-                  is_custom_outputs_dir: bool = False,
-                  datasets_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
-                  datasets_hub: Optional[str] = 'ModelScope',
-                  stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
-                  eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
-                  overall_task_cfg: Optional[dict] = None,
+                  use_cache: Optional[str] = None,
+                  outputs: Optional[OutputsStructure] = None,
+                  datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                  datasets_hub: Optional[str] = HubType.MODELSCOPE,
+                  stage: Optional[str] = EvalStage.ALL,
+                  eval_type: Optional[str] = EvalType.CHECKPOINT,
+                  overall_task_cfg: Optional[TaskConfig] = None,
                   **kwargs):

          self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-         self.custom_task_name: str = None
-         if os.path.exists(self.dataset_name_or_path):
-             self.custom_task_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
+         self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+         self.model_name = overall_task_cfg.model_id
+         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-         self.root_cache_dir = os.path.expanduser(root_cache_dir)
          self.datasets_dir = os.path.expanduser(datasets_dir)
          self.kwargs = kwargs
          self.data_adapter = data_adapter
@@ -78,70 +70,31 @@ class Evaluator(object):
          self.use_cache = use_cache
          self.overall_task_cfg = overall_task_cfg
          if isinstance(self.model_adapter, CustomModelAdapter):
-             self.overall_task_cfg.update({'custom_config': self.model_adapter.custom_model.config})
+             self.overall_task_cfg.model_args = self.model_adapter.custom_model.config

          self.model_cfg = self.model_adapter.model_cfg
-         self.model_id = self.model_cfg['model_id']
-         self.model_revision = self.model_cfg.get('revision', None)
-         self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'
-
-         # Get default outputs_dir
-         # TODO: refactor outputs_dir, del timestamp concat
-         # if not is_custom_outputs_dir:
-         #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-         #                                    model_id=self.model_id,
-         #                                    model_revision=self.model_revision_str)
-
-         self.outputs_dir = os.path.expanduser(outputs_dir)

          # Deal with the output paths
-         self.outputs_structure = process_outputs_structure(self.outputs_dir)
+         self.outputs_structure = outputs

          # Load dataset
-         self.dataset = self.data_adapter.load(dataset_name_or_path=dataset_name_or_path,
-                                               subset_list=subset_list,
-                                               work_dir=self.datasets_dir,
-                                               datasets_hub=datasets_hub,
-                                               **kwargs)
+         self.dataset = self.data_adapter.load(
+             dataset_name_or_path=dataset_name_or_path,
+             subset_list=subset_list,
+             work_dir=self.datasets_dir,
+             datasets_hub=datasets_hub,
+             **kwargs)

          # Get prompts from dataset
          self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
          del self.dataset

-         # Init memory cache
-         # TODO: refactor mem cache manager
-         # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
-         #     '_' + self.model_id.replace('/', '_') + \
-         #     '_' + self.model_revision_str + \
-         #     '_cache.pkl'
-         # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)
-
-         # Note: mem_cache is deprecated, use `use_cache` instead
-         self.mem_cache = None
-         self.mem_cache_method = mem_cache_method
-         # if self.use_cache:
-         #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
-         #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')
-
-     def _pred_answer(self,
-                      input_d: dict,
-                      infer_cfg: dict,
-                      subset_name: str,
-                      answer_id: str = None) -> dict:
-
-         # Get answer from memory cache
-         if self.mem_cache is not None:
-             if answer_id in self.mem_cache:
-                 logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
-                 return self.mem_cache[answer_id]
+     def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:

          ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
          ans[AnswerKeys.ANSWER_ID] = answer_id
          ans[AnswerKeys.SUBSET_NAME] = subset_name

-         if self.mem_cache is not None:
-             self.mem_cache[answer_id] = ans
-
          return ans

      def get_answers(self,
@@ -177,26 +130,21 @@ class Evaluator(object):
          assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'

          answers_list = []
-         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
-
-         if self.custom_task_name:
-             pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-         else:
-             pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-
-         pred_file_path: str = os.path.join(pred_dir, pred_file_name)
+         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
+         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)

          if self.use_cache and os.path.exists(pred_file_path):
              answers_list = jsonl_to_list(pred_file_path)
-             logger.info(f'** Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
-
-             return answers_list
+             logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
+             # Note: assume prediction in order of prompts_list
+             prompts_list = prompts_list[len(answers_list):]

          if isinstance(self.model_adapter, CustomModelAdapter):
              # Batch inference for custom model

-             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(inputs=prompts_list,
-                                                                                   infer_cfg=infer_cfg)
+             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
+                 inputs=prompts_list, infer_cfg=infer_cfg)

              assert len(prompts_list) == len(resp_answers_list), \
                  f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
@@ -207,10 +155,10 @@ class Evaluator(object):
                  model_cfg_str = json.dumps(
                      OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                      ensure_ascii=False)
-                 input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())),
-                                               ensure_ascii=False)
-                 infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
-                                            ensure_ascii=False)
+                 input_prompt_str = json.dumps(
+                     OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
+                 infer_cfg_str = json.dumps(
+                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                  answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                  resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
@@ -220,6 +168,7 @@ class Evaluator(object):
                  resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d

                  answers_list.append(resp_d)
+                 dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)

          else:
              for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
@@ -228,17 +177,15 @@ class Evaluator(object):
                  model_cfg_str = json.dumps(
                      OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                      ensure_ascii=False)
-                 input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())),
-                                               ensure_ascii=False)
-                 infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
-                                            ensure_ascii=False)
+                 input_prompt_str = json.dumps(
+                     OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
+                 infer_cfg_str = json.dumps(
+                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                  answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                  # Get answers
-                 answer_d: dict = self._pred_answer(input_d=input_prompt,
-                                                    infer_cfg=infer_cfg,
-                                                    subset_name=subset_name,
-                                                    answer_id=answer_id)
+                 answer_d: dict = self._pred_answer(
+                     input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)

                  answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                  answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
@@ -249,26 +196,12 @@ class Evaluator(object):
                      logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

                  answers_list.append(answer_d)
+                 dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)

-         if len(answers_list) == 0:
-             logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')
-
-         # Dump answers
-         os.makedirs(pred_dir, exist_ok=True)
-         dump_jsonl_data(answers_list, pred_file_path)
-
+         logger.info(f'Dump predictions to {pred_file_path}.')
          return answers_list

-     def _get_review(self,
-                     answer_d: dict,
-                     review_id: str = None,
-                     reviewer_spec: dict = None) -> dict:
-
-         # Get review from memory cache
-         if self.mem_cache is not None:
-             if review_id in self.mem_cache:
-                 logger.info(f'** Reusing review `{review_id}` in memory cache.')
-                 return self.mem_cache[review_id]
+     def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:

          if reviewer_spec is None:
              reviewer_spec = {}
@@ -286,15 +219,16 @@ class Evaluator(object):
          for choice in choices:
              raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
              answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-             answer_content = self.data_adapter.parse_pred_result(result=answer_content,
-                                                                   raw_input_d=raw_input_d,
-                                                                   eval_type=self.eval_type)
+             answer_content = self.data_adapter.parse_pred_result(
+                 result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
              gold_content = self.data_adapter.get_gold_answer(raw_input_d)

              review_result = self.data_adapter.match(gold_content, answer_content)
-             choice[ReviewKeys.REVIEW] = {ReviewKeys.GOLD: gold_content,
-                                          ReviewKeys.PRED: answer_content,
-                                          ReviewKeys.RESULT: review_result}
+             choice[ReviewKeys.REVIEW] = {
+                 ReviewKeys.GOLD: gold_content,
+                 ReviewKeys.PRED: answer_content,
+                 ReviewKeys.RESULT: review_result
+             }

              rev_choices.append(choice)

@@ -304,9 +238,6 @@ class Evaluator(object):
          review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
          review_res[ReviewKeys.REVIEW_TIME] = time.time()

-         if self.mem_cache is not None:
-             self.mem_cache[review_id] = review_res
-
          return review_res

      def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
@@ -324,26 +255,25 @@ class Evaluator(object):
          """
          reviews_list = []

-         review_dir: str = self.outputs_structure.get(OutputsStructure.REVIEWS_DIR)
-         if self.custom_task_name:
-             review_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-         else:
-             review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-         review_file_path: str = os.path.join(review_dir, review_file_name)
+         review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+         review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
+         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

          if self.use_cache and os.path.exists(review_file_path):
-             logger.warning(f'** Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

          for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):

              # Gen review_id (concat: answer_id + reviewer_spec)
              answer_id = answer_d[AnswerKeys.ANSWER_ID]

-             reviewer_spec: dict = {'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                                    'reviewer': ['Evaluator'],
-                                    'revision': ['default']}
-             reviewer_spec_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())),
-                                            ensure_ascii=False)
+             reviewer_spec: dict = {
+                 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+                 'reviewer': ['Evaluator'],
+                 'revision': ['default']
+             }
+             reviewer_spec_str = json.dumps(
+                 OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
              review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)

              # Get review
@@ -354,9 +284,8 @@ class Evaluator(object):

              reviews_list.append(review_d)

-         # Dump reviews
-         os.makedirs(review_dir, exist_ok=True)
-         dump_jsonl_data(reviews_list, review_file_path)
+             # Dump reviews
+             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

          return reviews_list

@@ -375,7 +304,7 @@ class Evaluator(object):
          review_res_list = []
          for review_d in reviews_list:
              if not review_d[ReviewKeys.REVIEWED]:
-                 logger.warning(f'** Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                  continue

              review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -385,7 +314,7 @@ class Evaluator(object):

          return metric_score

-     def dump_report(self, report_map: dict, use_table: bool = True):
+     def dump_report(self, reviews_score_all: dict, use_table: bool = True):
          """
          Get report for total reviews of specific dataset.
          It is required to rewrite this method to support your own evaluator.
@@ -396,50 +325,31 @@ class Evaluator(object):

          Returns: None
          """
+         # Get report map
+         report_map: dict = self.data_adapter.gen_report(
+             subset_score_map=reviews_score_all, report_name=self.custom_task_name)
+         report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))

          # Dump report
-         report_dir: str = self.outputs_structure[OutputsStructure.REPORTS_DIR]
-
-         if self.custom_task_name:
-             report_file_name: str = self.custom_task_name + '.json'
-         else:
-             report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'
+         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
+                                         self.dataset_name + '.json')
+         os.makedirs(os.path.dirname(report_path), exist_ok=True)

-         os.makedirs(report_dir, exist_ok=True)
-         report_path: str = os.path.join(report_dir, report_file_name)
+         # Write report
          with open(report_path, 'w') as f:
              f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_path} \n')
-         logger.info(f'** Dump report: {report_file_name} \n')
+         logger.info(f'Dump report: {report_path} \n')

+         # Make table
          if use_table:
              try:
-                 # Make table
-                 report_table: str = gen_table([report_dir])
-                 logger.info(f'** Report table: \n {report_table} \n')
-             except:
+                 report_table: str = gen_table([self.outputs_structure.reports_dir])
+                 logger.info(f'Report table: \n{report_table} \n')
+             except Exception:
                  logger.error('Failed to generate report table.')
+         return report_map

-         # def save_cache(self):
-         #     if self.mem_cache is not None:
-         #         logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
-         #         Cache.save(cache=self.mem_cache, path=self.mem_cache_path)
-
-         # def clear_cache(self):
-         #     """
-         #     Clear memory cache.
-         #
-         #     Returns: None
-         #     """
-         #     if self.mem_cache is not None:
-         #         cache_len = len(self.mem_cache)
-         #         self.mem_cache.clear()
-         #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')
-
-     def eval(self,
-              infer_cfg: dict = None,
-              debug: bool = False,
-              **kwargs) -> dict:
+     def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
          """
          Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
          It is required to rewrite this method to support your own evaluator.
@@ -465,27 +375,22 @@

          logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')

-         reviews_score_all = {} # {subset_name: (score, num)}
+         reviews_score_all = {}  # {subset_name: (score, num)}
          stage_answers_dict = {}
          stage_reviews_dict = {}

          for subset_name, prompts_list in self.prompts.items():
-             limit = infer_cfg.get('limit', len(prompts_list))
+             limit = kwargs.get('limit', len(prompts_list))
              prompts_list = prompts_list[:limit]

-             answers_list: list = self.get_answers(subset_name=subset_name,
-                                                   prompts_list=prompts_list,
-                                                   infer_cfg=infer_cfg,
-                                                   debug=debug,
-                                                   **kwargs)
+             answers_list: list = self.get_answers(
+                 subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
              if self.stage == EvalStage.INFER:
                  stage_answers_dict[subset_name] = answers_list
                  continue

-             reviews_list: list = self.get_reviews(subset_name=subset_name,
-                                                   answers_list=answers_list,
-                                                   debug=debug,
-                                                   **kwargs)
+             reviews_list: list = self.get_reviews(
+                 subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)

              metric_res = self.compute_metrics(reviews_list=reviews_list)
              reviews_score_all[subset_name] = (metric_res, len(reviews_list))
@@ -498,193 +403,8 @@ class Evaluator(object):
              return stage_reviews_dict

          # Generate report
-         report_map: dict = self.data_adapter.gen_report(subset_score_map=reviews_score_all,
-                                                         report_name=self.custom_task_name)
-         self.dump_report(report_map=report_map)
-
-         # Dump overall task config
-         overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
-                                                   'task_output_config.yaml')
-         overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)
-
-         # TODO: check the robustness of dump yaml
-         try:
-             logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
-             logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
-             if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
-                 self.overall_task_cfg['model'] = None
-                 logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
-             if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
-                 self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
-                 logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')
-
-             dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
-         except Exception as e:
-             logger.warning(f'Failed to dump overall task config: {e}')
-
-         # Note: deprecated
-         # self.save_cache()
-         # self.clear_cache()
-
-         logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')
-
-         return report_map
-
-
- class HumanevalEvaluator(object):
+         report_map = self.dump_report(reviews_score_all)

-     def __init__(self,
-                  problem_file: str,
-                  model_id: str,
-                  model_revision: str,
-                  model_adapter: BaseModelAdapter,
-                  outputs_dir: Optional[str] = '',
-                  is_custom_outputs_dir: bool = False,
-                  k: List[int] = [1, 10, 100],
-                  n_workers: int = 4,
-                  timeout: float = 3.0,):
-         try:
-             from human_eval.evaluation import evaluate_functional_correctness
-             from human_eval.data import read_problems, write_jsonl
-         except ImportError:
-             raise ImportError('Please install human_eval:'
-                               'https://github.com/openai/human-eval/tree/master#installation , '
-                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-         self.problem_file = problem_file
-         self.k = k
-         self.num_workers = n_workers
-         self.timeout = timeout
-         self.model_adapter = model_adapter
-
-         self.read_problems_func = read_problems
-         self.write_jsonl_func = write_jsonl
-         self.eval_func = evaluate_functional_correctness
-
-         # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-         self.problems = self.read_problems_func(self.problem_file)
-
-         # Get default outputs_dir
-         model_revision_str: str = model_revision if model_revision is not None else 'none'
-         # if not is_custom_outputs_dir:
-         #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-         #                                    model_id=model_id,
-         #                                    model_revision=model_revision_str)
-         self.outputs_dir = os.path.expanduser(outputs_dir)
-
-         # Deal with the output paths
-         self.outputs_structure = process_outputs_structure(self.outputs_dir)
-
-     def get_answers(self, infer_cfg: dict) -> List[dict]:
-         ans_list: list = []
-         system_prompt: str = 'Complete the following python code:\n'
-         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-             prompt: str = system_prompt + data_d['prompt']
-             inputs: dict = {'data': [prompt]}
-             # pred_res: dict = self.model_adapter.predict(inputs)
-
-             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-             pred_ans: str = pred_res['choices'][0]['message']['content']
-             pred_ans = self._postprocess(pred_ans)
-
-             ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-         return ans_list
-
-     def eval(self, infer_cfg: dict, **kwargs):
+         logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')

-         # predict
-         ans_list: list = self.get_answers(infer_cfg)
-         ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
-                                          'human_eval_predictions.jsonl')
-
-         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-         logger.info('** Dump predictions successfully.')
-
-         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-         results = self.eval_func(sample_file=ans_out_file,
-                                  k=self.k,
-                                  n_workers=self.num_workers,
-                                  timeout=self.timeout,
-                                  problem_file=self.problem_file)
-
-         # output: report
-         report_map: dict = self.gen_report(results=results)
-         report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
-         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-         with open(report_file, 'w') as f:
-             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_file} \n')
-         logger.info(f'** Dump report \n')
-
-         try:
-             # Make table
-             report_table: str = gen_table([report_dir])
-             logger.info(f'** Report table: \n {report_table} \n')
-         except:
-             logger.error('Failed to generate report table.')
-
-     def gen_report(self, results: dict) -> dict:
-         """
-         Generate report from evaluation results.
-
-         Returns:
-             {
-                 "name":"ARC-Challenge",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.3389,
-                         "subset":[
-                             {
-                                 "name":"ARC-Challenge",
-                                 "score":0.3389
-                             },
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         results = {k: normalize_score(score=v) for k, v in results.items()}
-
-         category_d = dict(name='DEFAULT',
-                           score=results,
-                           subset=[])
-
-         res_map = dict(name='HumanEval',
-                        metric='pass@k',
-                        score=results,
-                        category=[category_d],
-                        total_num=len(self.problems))
-
-         return res_map
-
-     @classmethod
-     def _postprocess(cls, text: str) -> str:
-         if '```' in text:
-             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-             if len(blocks) == 0:
-                 text = text.split('```')[1]  # fall back to default strategy
-             else:
-                 text = blocks[0]  # fetch the first code block
-                 if not text.startswith('\n'):  # in case starting with ```python
-                     text = text[max(text.find('\n') + 1, 0):]
-         if text.strip().startswith('from') or text.strip().startswith('import'):
-             def_idx = text.find('def')
-             if def_idx != -1:
-                 text = text[max(text.find('\n', def_idx) + 1, 0):]
-             text = text.split('\n\n')[0]
-         if text.strip().startswith('def'):
-             text = '\n'.join(text.split('\n')[1:])
-         if not text.startswith('    '):
-             if text.startswith(' '):
-                 text = '    ' + text.lstrip()
-             else:
-                 text = '\n'.join(['    ' + line for line in text.split('\n')])
-         return text
+         return report_map
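The cache key in the diff above ('answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)) is content-addressed: each config dict is key-sorted and JSON-serialized before hashing, so identical settings map to the same answer_id across runs. A rough, hedged equivalent is sketched below; md5 stands in for gen_hash (whose implementation is not shown in this diff) and default=str approximates dict_torch_dtype_to_str's normalization of non-serializable values such as torch dtypes.

```python
import hashlib
import json
from collections import OrderedDict


def make_answer_id(model_cfg: dict, prompt: dict, infer_cfg: dict) -> str:
    """Sketch of the content-addressed answer id built in the diff above
    ('answer-' + gen_hash(...)); md5 is a stand-in for evalscope's gen_hash."""

    def canonical(d: dict) -> str:
        # Key-sort before serializing so the string (and the hash) is stable;
        # default=str approximates dict_torch_dtype_to_str for non-JSON values.
        return json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False, default=str)

    payload = canonical(model_cfg) + canonical(prompt) + canonical(infer_cfg)
    return 'answer-' + hashlib.md5(payload.encode('utf-8')).hexdigest()
```

For example, make_answer_id({'model_id': 'my-model'}, {'data': ['2 + 2 = ?']}, {'max_new_tokens': 64}) returns the same id on every call with these arguments, so a rerun can recognize an answer it has already produced.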