evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (78)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +67 -59
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +12 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/backend/rag_eval/utils/llm.py +1 -1
  11. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  12. evalscope/benchmarks/benchmark.py +1 -0
  13. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  14. evalscope/benchmarks/data_adapter.py +101 -18
  15. evalscope/benchmarks/docmath/__init__.py +0 -0
  16. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope/benchmarks/drop/__init__.py +0 -0
  19. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  20. evalscope/benchmarks/drop/utils.py +59 -0
  21. evalscope/benchmarks/frames/__init__.py +0 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  23. evalscope/benchmarks/frames/utils.py +37 -0
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  25. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  27. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  29. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  30. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  31. evalscope/benchmarks/tool_bench/utils.py +203 -0
  32. evalscope/benchmarks/utils.py +28 -2
  33. evalscope/benchmarks/winogrande/__init__.py +0 -0
  34. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  35. evalscope/cli/start_app.py +2 -2
  36. evalscope/collections/__init__.py +35 -3
  37. evalscope/collections/evaluator.py +94 -32
  38. evalscope/config.py +54 -17
  39. evalscope/evaluator/evaluator.py +80 -41
  40. evalscope/metrics/__init__.py +3 -1
  41. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  42. evalscope/metrics/llm_judge.py +15 -8
  43. evalscope/metrics/math_parser.py +1 -1
  44. evalscope/metrics/rouge_metric.py +11 -13
  45. evalscope/models/adapters/chat_adapter.py +51 -34
  46. evalscope/models/adapters/server_adapter.py +17 -25
  47. evalscope/perf/arguments.py +16 -7
  48. evalscope/perf/benchmark.py +0 -15
  49. evalscope/perf/main.py +72 -15
  50. evalscope/perf/plugin/datasets/custom.py +15 -0
  51. evalscope/perf/utils/benchmark_util.py +34 -16
  52. evalscope/perf/utils/db_util.py +25 -15
  53. evalscope/perf/utils/local_server.py +1 -0
  54. evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope/report/__init__.py +36 -4
  57. evalscope/report/combinator.py +8 -0
  58. evalscope/report/generator.py +33 -9
  59. evalscope/report/utils.py +61 -4
  60. evalscope/run.py +12 -0
  61. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  62. evalscope/utils/deprecation_utils.py +42 -0
  63. evalscope/utils/logger.py +1 -1
  64. evalscope/utils/utils.py +12 -0
  65. evalscope/version.py +2 -2
  66. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
  67. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
  68. tests/aigc/test_t2i.py +40 -3
  69. tests/cli/test_all.py +39 -32
  70. tests/cli/test_collection.py +8 -6
  71. tests/cli/test_run.py +43 -17
  72. tests/perf/test_perf.py +23 -0
  73. tests/rag/test_mteb.py +5 -5
  74. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  75. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  76. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  77. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  78. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,70 @@
+ from typing import Dict, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType, OutputType
+ from evalscope.metrics import Metric, mean, metric_registry
+
+
+ @Benchmark.register(
+     name='tool_bench',
+     pretty_name='ToolBench-Static',
+     dataset_id='AI-ModelScope/ToolBench-Static',
+     subset_list=['in_domain', 'out_of_domain'],
+     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class ToolBenchAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         metric_registry.register(Metric(name='Rouge-L', object=mean))
+         metric_registry.register(Metric(name='Act.EM', object=mean))
+         metric_registry.register(Metric(name='Plan.EM', object=mean))
+         metric_registry.register(Metric(name='F1', object=mean))
+         metric_registry.register(Metric(name='HalluRate', object=mean))
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         """
+         messages = input_d['messages']
+         # use prepared messages and remove the name field
+         for message in messages:
+             if 'name' in message:
+                 del message['name']
+         return self.gen_prompt_data(prompt='', messages=messages)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         return input_d
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         return result
+
+     def match(self, gold: dict, pred: str) -> Dict:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         from .utils import calculate_metrics
+
+         data = {
+             'target': gold['target'],
+             'predictions': pred,
+             'tools': gold['tools'],
+         }
+         metrics = calculate_metrics(data)
+         return metrics
+
+     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
+         # aggregate review results
+         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+         return super().compute_metric(res_dict, **kwargs)
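Since the adapter above registers itself under the name 'tool_bench', it should be runnable like any other benchmark. Below is a minimal usage sketch (not part of the diff), assuming evalscope's documented TaskConfig/run_task entry points; the model name and API settings are placeholders.

# Hypothetical usage sketch for the new ToolBench-Static benchmark.
# Model id and endpoint are placeholders, not values from this release.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',          # placeholder model id
    api_url='http://127.0.0.1:8801/v1',   # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
    eval_type='service',
    datasets=['tool_bench'],              # name registered by @Benchmark.register above
    limit=10,
)
run_task(task_cfg=task_cfg)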
@@ -0,0 +1,203 @@
+ import json
+
+ from evalscope.metrics import compute_rouge_score_one_sample
+
+
+ def evaluate_rougel(cand_list: list, ref_list: list):
+     if len(ref_list) == 0:
+         return 0
+     rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
+     rougel = rouge_score.get('rouge-l-f', 0)
+
+     return rougel
+
+
+ def evaluate_action_em(cand_list: list, ref_list: list):
+     if len(ref_list) == 0:
+         return 0
+     em = 0
+     for cand, ref in zip(cand_list, ref_list):
+         em += (1 if cand == ref else 0)
+     return em / len(cand_list)
+
+
+ def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
+     easy_f1 = []
+     hard_f1 = []
+     f1 = []
+     for i in range(len(action_pred)):
+         ref_action = action_ref[i]
+         pred_action = action_pred[i]
+
+         ref_input = ref_list[i]
+         cand_input = cand_list[i]
+
+         if ref_action != pred_action:
+             easy_f1.append(0)
+             hard_f1.append(0)
+             f1.append(0)
+         else:
+             try:
+                 ref_input_json = json.loads(ref_input)
+                 try:
+                     cand_input_json = json.loads(cand_input)
+                     half_match = 0
+                     full_match = 0
+                     if ref_input_json == {}:
+                         if cand_input_json == {}:
+                             easy_f1.append(1)
+                             f1.append(1)
+                         else:
+                             easy_f1.append(0)
+                             f1.append(0)
+                     else:
+                         for k, v in ref_input_json.items():
+                             if k in cand_input_json.keys():
+                                 if cand_input_json[k] == v:
+                                     full_match += 1
+                                 else:
+                                     half_match += 1
+
+                         recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
+                         precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
+                         hard_f1.append((2 * recall * precision) / (recall + precision))
+                         f1.append((2 * recall * precision) / (recall + precision))
+                 except Exception:
+                     # cand_input = cand_input.replace("\n","").replace("\"","")
+                     # ref_input = cand_input.replace("\n","").replace("\"","")
+                     # rouge = Rouge()
+                     # rouge_score = rouge.get_scores(hyps=[cand_input], refs=[ref_input], avg=True)
+                     if ref_input_json == {}:
+                         easy_f1.append(0)
+                     else:
+                         hard_f1.append(0)
+                         # hard_f1.append(rouge_score["rouge-l"]["f"])
+                         # f1.append(rouge_score["rouge-l"]["f"])
+                     f1.append(0)
+             except Exception:
+                 pass
+
+     # Check whether the lists are empty; return 0 for any empty list
+     easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
+     hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
+     f1_avg = sum(f1) / len(f1) if f1 else 0
+
+     return easy_f1_avg, hard_f1_avg, f1_avg
+
+
+ def parse_action(text):
+     action = 'None'
+     action_input = '{}'
+     if 'Action Input:' in text:
+         input_idx = text.rindex('Action Input:')
+         action_input = text[input_idx + len('Action Input:'):].strip()
+     else:
+         action_input = '{}'
+
+     if 'Action:' in text:
+         action_idx = text.rindex('Action:')
+         action = text[action_idx + len('Action:'):].strip()
+         if 'Action Input:' in action:
+             input_idx = action.index('Action Input:')
+             action = action[:input_idx].strip()
+     else:
+         action = 'none'
+     return action, action_input
+
+
+ def parse_output(text):
+     action, action_input = parse_action(text)
+     if action == 'Finish':
+         try:
+             action_input = json.loads(action_input)
+             # print(action_input)
+             # print(json.dumps(action_input,indent=2))
+             return_type = action_input['return_type']
+             if return_type == 'give_answer':
+                 if 'final_answer' in action_input.keys():
+                     answer = str(action_input['final_answer'])
+                     if answer.strip() in ['', '.', ',']:
+                         answer = 'None'
+                 else:
+                     answer = 'None'
+                 return 'finish', action, action_input, answer
+             else:
+                 return 'give up', None, None, None
+         except Exception:
+             return 'give up', None, None, None
+     else:
+         plan = 'call'
+         answer = None
+         return plan, action, action_input, answer
+
+
+ def calculate_metrics(data):
+     """
+     Calculate the metrics for the given data.
+     """
+     plan_ref = []
+     plan_pred = []
+     hallu_cases = []
+     answer_ref = []
+     action_ref = []
+     action_input_ref = []
+     answer_pred = []
+     action_pred = []
+     action_input_pred = []
+     hallu_pred = 0
+
+     reference = data['target']
+     prediction = data['predictions']
+     ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
+     # ref_plan: call
+     # ref_action: spott
+     # ref_input: {"is_id": "city center" }
+     # ref_ans: None
+
+     pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
+     if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
+         return {}
+     if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
+         return {}
+
+     if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
+         hallu_pred += 1
+         hallu_cases.append(data)
+
+     plan_ref.append(ref_plan)
+     plan_pred.append(pred_plan)
+     if ref_plan == 'give up':
+         pass
+     elif ref_plan == 'finish':
+         answer_ref.append(ref_ans)
+         if pred_ans is None:
+             answer_pred.append('none')
+         else:
+             answer_pred.append(pred_ans)
+     else:
+         action_ref.append(ref_action)
+         action_input_ref.append(ref_input)
+         if pred_action is None:
+             action_pred.append('none')
+         else:
+             action_pred.append(pred_action)
+
+         if pred_input is None:
+             action_input_pred.append('{}')
+         else:
+             action_input_pred.append(pred_input)
+
+     metric = {}
+     rouge = evaluate_rougel(answer_pred, answer_ref)
+     plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
+     action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
+     easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
+     hallu_rate = hallu_pred
+     metric['Act.EM'] = action_em
+     metric['F1'] = f1
+     metric['HalluRate'] = hallu_rate
+     metric['plan_em'] = plan_em
+     metric['Easy_F1'] = easy_f1
+     metric['Hard_F1'] = hard_f1
+     metric['Rouge-L'] = rouge
+     return metric
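To make the parsing and scoring flow above concrete, here is a small worked example (not part of the diff) that feeds a ReAct-style prediction through parse_output and calculate_metrics; the tool name and strings are invented.

# Illustrative only: exercises the helpers added in evalscope/benchmarks/tool_bench/utils.py.
from evalscope.benchmarks.tool_bench.utils import calculate_metrics, parse_output

target = 'Action: spott\nAction Input: {"is_id": "city center"}'
prediction = 'Action: spott\nAction Input: {"is_id": "city center"}'

print(parse_output(prediction))
# -> ('call', 'spott', '{"is_id": "city center"}', None)

metrics = calculate_metrics({
    'target': target,
    'predictions': prediction,
    'tools': [{'name': 'spott'}],
})
print(metrics['Act.EM'], metrics['F1'], metrics['HalluRate'])
# Identical action and arguments give Act.EM and F1 of (approximately) 1.0, with HalluRate 0.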
@@ -13,6 +13,7 @@ class PromptData:
      system_prompt: Optional[str] = None
      multi_choices: Optional[List[str]] = None
      id: Optional[str] = None
+     messages: Optional[List[dict]] = None

      def to_dict(self) -> Dict:
          return {k: v for k, v in asdict(self).items() if v is not None}
@@ -21,7 +22,7 @@ class PromptData:
  def preprocess_decorator(func):

      @wraps(func)
-     def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+     def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
          if result is None:
              result = ''
          filters = self.config_kwargs.get('filters', None)
@@ -29,6 +30,31 @@ def preprocess_decorator(func):
              # Apply filters to the result
              for filter_name, filter_value in filters.items():
                  result = Filter.apply(filter_name, result, filter_value)
-         return func(self, result, raw_input_d, eval_type)
+         return func(self, result, raw_input_d, **kwargs)

      return wrapper
+
+
+ def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
+     """
+     Load a file with a specific extension and return its content as a list of dictionaries.
+     """
+     import json
+     import os
+
+     if isinstance(file_path, str):
+         file_path = [file_path]
+
+     data = []
+     for path in file_path:
+         if not os.path.exists(path):
+             raise FileNotFoundError(f'The file {path} does not exist.')
+
+         with open(path, 'r', encoding='utf-8') as f:
+             if path.endswith('.json'):
+                 data.extend(json.load(f))
+             elif path.endswith('.jsonl'):
+                 data.extend([json.loads(line) for line in f])
+             elif path.endswith('.txt'):
+                 data.extend([{'text': f.read()}])
+     return data
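A quick usage sketch for the new load_file_with_extension helper (not part of the diff; the file paths are placeholders).

# Illustrative only: the paths below are placeholders for local .json/.jsonl/.txt files.
from evalscope.benchmarks.utils import load_file_with_extension

records = load_file_with_extension(['data/train.jsonl', 'data/extra.json'])
print(len(records), records[0])

# A .txt file is wrapped as a single record: [{'text': '<file contents>'}]
notes = load_file_with_extension('data/notes.txt')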
File without changes
@@ -0,0 +1,57 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType, OutputType
+ from evalscope.metrics import exact_match
+ from evalscope.utils.utils import ResponseParser
+
+
+ @Benchmark.register(
+     name='winogrande',
+     pretty_name='Winogrande',
+     dataset_id='AI-ModelScope/winogrande_val',
+     model_adapter=OutputType.GENERATION,
+     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+     metric_list=['AverageAccuracy'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='validation',
+     prompt_template='Question: {query}\nA. {option1}\nB. {option2}\nAnswer:',  # noqa: E501
+ )
+ class WinograndeAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B']
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         """
+         prompt = self.prompt_template.format(
+             query=input_d['sentence'],
+             option1=input_d['option1'],
+             option2=input_d['option2'],
+         )
+         return self.gen_prompt_data(prompt)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         answer_index = int(input_d['answer']) - 1
+         return self.choices[answer_index]
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+             return result
+         else:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         return exact_match(gold=gold, pred=pred)
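As a sanity check on the prompt construction above, here is a small illustration (not part of the diff) with a hand-written Winogrande-style item; the sentence and options are invented.

# Illustrative only: a made-up item in the winogrande_val format expected by the adapter.
prompt_template = 'Question: {query}\nA. {option1}\nB. {option2}\nAnswer:'

input_d = {
    'sentence': 'The trophy would not fit in the suitcase because _ was too big.',
    'option1': 'the trophy',
    'option2': 'the suitcase',
    'answer': '1',  # 1-based index into (option1, option2)
}

prompt = prompt_template.format(
    query=input_d['sentence'],
    option1=input_d['option1'],
    option2=input_d['option2'],
)
print(prompt)
# get_gold_answer maps '1' -> 'A' and '2' -> 'B', so the gold label here is 'A'.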
@@ -21,13 +21,13 @@ class StartAppCMD(CLICommand):
      def define_args(parsers: ArgumentParser):
          """ define args for create pipeline template command.
          """
-         from evalscope.report import add_argument
+         from evalscope.app import add_argument

          parser = parsers.add_parser(StartAppCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)

      def execute(self):
-         from evalscope.report.app import create_app
+         from evalscope.app import create_app

          create_app(self.args)
@@ -1,3 +1,35 @@
- from evalscope.collections.evaluator import EvaluatorCollection
- from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
- from evalscope.collections.schema import CollectionSchema, DatasetInfo
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING
+
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+     from .evaluator import EvaluatorCollection
+     from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+     from .schema import CollectionSchema, DatasetInfo
+
+ else:
+     _import_structure = {
+         'evaluator': [
+             'EvaluatorCollection',
+         ],
+         'sampler': [
+             'StratifiedSampler',
+             'UniformSampler',
+             'WeightedSampler',
+         ],
+         'schema': [
+             'CollectionSchema',
+             'DatasetInfo',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
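The rewritten __init__.py switches evalscope.collections to lazy imports: the public names stay importable at their usual paths, but submodules are only loaded on first access. A behaviour sketch, under the assumption that _LazyModule resolves attributes on demand (not verified against its implementation):

# Illustrative only: the public names are unchanged, but heavy submodules load on demand.
import sys

import evalscope.collections as collections

print('evalscope.collections.sampler' in sys.modules)  # expected False right after import
sampler_cls = collections.WeightedSampler               # attribute access imports .sampler lazily
print('evalscope.collections.sampler' in sys.modules)  # expected True once resolved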
@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
  from copy import deepcopy
  from tabulate import tabulate
  from tqdm import tqdm
- from typing import List
+ from typing import Any, Dict, List

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.collections.sampler import DatasetEntry
@@ -70,9 +70,13 @@ class EvaluatorCollection:
          dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
          raw_dataset = self.data_adapter.load()
          # random limit the dataset
-         if self.task_cfg.limit:
-             raw_dataset = random.sample(raw_dataset,
-                                         self.task_cfg.limit) if len(raw_dataset) > self.task_cfg.limit else raw_dataset
+         limit = len(raw_dataset)
+         if self.task_cfg.limit is not None:
+             if isinstance(self.task_cfg.limit, int):
+                 limit = self.task_cfg.limit
+             elif isinstance(self.task_cfg.limit, float):
+                 limit = int(len(raw_dataset) * self.task_cfg.limit)
+         raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
          # index dataset
          datasets = []
          for sample in raw_dataset:
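The new limit handling accepts either an absolute sample count (int) or a fraction of the dataset (float). A standalone restatement of that logic (a sketch, not evalscope code):

# Sketch only: mirrors the limit branch above outside of EvaluatorCollection.
import random

def apply_limit(raw_dataset, limit_cfg):
    limit = len(raw_dataset)
    if limit_cfg is not None:
        if isinstance(limit_cfg, int):
            limit = limit_cfg                          # absolute number of samples
        elif isinstance(limit_cfg, float):
            limit = int(len(raw_dataset) * limit_cfg)  # fraction of the dataset
    return random.sample(raw_dataset, min(limit, len(raw_dataset)))

print(len(apply_limit(list(range(500)), 0.1)))   # 50
print(len(apply_limit(list(range(500)), 100)))   # 100
print(len(apply_limit(list(range(500)), None)))  # 500 (full dataset, randomly ordered)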
@@ -179,32 +183,43 @@ class EvaluatorCollection:
          logger.info(f'{level} Report:\n{table}')

          report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+         # Make report analysis
+         if self.task_cfg.analysis_report:
+             logger.info('Generating report analysis, please wait ...')
+             analysis = report.generate_analysis(self.task_cfg.judge_model_args)
+             logger.info('Report analysis:\n%s', analysis)
+         else:
+             logger.info('Skipping report analysis (`analysis_report=False`).')
+
          # save report to JSON file
          report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
-         with open(report_file_path, 'w', encoding='utf-8') as f:
-             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+         report.to_json(report_file_path)
+
+         logger.info(f'Report saved to {report_file_path}')
          return report

      def _filter_answer(self, pred_file_path):
          answer_dict = defaultdict(dict)
          if self.task_cfg.use_cache and os.path.exists(pred_file_path):
              answers_list = jsonl_to_list(pred_file_path)
+             # Create a set of sample indices for which we have answers
              indices = set()
              for answer in answers_list:
                  index = answer.get(AnswerKeys.INDEX)
                  answer_dict[index] = answer
                  indices.add(index)

-             data = []
-             for sample in self.dataset:
-                 if sample.index not in indices:
-                     data.append(sample)
+             # Filter dataset to only include samples that don't have answers
+             data = [sample for sample in self.dataset if sample.index not in indices]
+
+             # Initialize name map for the filtered dataset
              data_map = self._init_name_map(data)

              logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
              return answer_dict, data, data_map
-         return answer_dict, self.dataset, self.dataset_name_map
+         else:
+             # If cache isn't enabled or file doesn't exist, return the full dataset
+             return answer_dict, self.dataset, self.dataset_name_map

      def get_answers(self):
          pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
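The report-analysis branch above is gated by two task options that appear in this diff, analysis_report and judge_model_args. A hedged configuration sketch (not from the diff; the judge-model field names are assumptions and may differ from the actual schema):

# Sketch only: the judge_model_args keys below are assumptions, not confirmed by this release.
from evalscope import TaskConfig

task_cfg = TaskConfig(
    model='my-model',            # placeholder
    datasets=['general_qa'],
    analysis_report=True,        # triggers report.generate_analysis(...) after evaluation
    judge_model_args={           # settings for the LLM used to write the analysis
        'model_id': 'qwen-plus',             # placeholder judge model
        'api_url': 'https://example.com/v1',  # placeholder endpoint
        'api_key': 'sk-xxx',
    },
)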
@@ -214,13 +229,16 @@ class EvaluatorCollection:
          answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

          eval_batch_size = self.task_cfg.eval_batch_size
+         # Process samples and get answers
          with tqdm(total=len(dataset), desc='Getting answers') as pbar:
              if self.task_cfg.eval_type == EvalType.SERVICE:
+                 # Create a thread pool for parallel processing
                  with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                      futures = []
                      for sample in dataset:
                          evaluator = self.evaluators[sample.dataset_name]
                          futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                     # Process completed tasks
                      for future in as_completed(futures):
                          answer_list, samples = future.result()
                          answers[samples[0].index] = answer_list[0]
@@ -244,35 +262,79 @@ class EvaluatorCollection:
                      pbar.update(len(batch_ids))
          return answers

-     def get_reviews(self, answers):
+     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+         """
+         Retrieve or generate reviews for given answers.
+
+         Args:
+             answers: Dictionary of answers indexed by sample index.
+
+         Returns:
+             Dictionary of reviews indexed by sample index.
+         """
+         # Set up the review file path
          review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
          os.makedirs(review_file_path, exist_ok=True)

-         if self.task_cfg.use_cache and os.path.exists(review_file_path):
-             logger.warning(
-                 f'Ignore use_cache={self.task_cfg.use_cache}, updating the review file: {review_file_path} ...')
-             if os.path.isdir(review_file_path):
-                 for filename in os.listdir(review_file_path):
-                     file_path = os.path.join(review_file_path, filename)
-                     try:
-                         if os.path.isfile(file_path):
-                             os.remove(file_path)
-                     except Exception as e:
-                         logger.error(f'Error deleting file {file_path}: {e}')
+         review_history_map = defaultdict(dict)
+
+         # Handle caching logic
+         if os.path.exists(review_file_path):
+             if not self.task_cfg.use_cache:
+                 # Clear existing reviews if not using cache
+                 self._clear_review_files(review_file_path)
              else:
-                 os.remove(review_file_path)
+                 # Load existing reviews if using cache
+                 self._load_existing_reviews(review_file_path, review_history_map)

-         reviews = defaultdict(dict)
+         reviews = {}
          for sample in tqdm(self.dataset, desc='Getting reviews'):
-             evaluator = self.evaluators[sample.dataset_name]
-             review_d = evaluator.get_review(answers[sample.index])
+             file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+             if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                 # Use cached review if available
+                 review_d = review_history_map[file_name][sample.index]
+             else:
+                 # Generate new review
+                 evaluator = self.evaluators[sample.dataset_name]
+                 review_d = evaluator.get_review(answers[sample.index])
+                 # Only save the review if it's not in the cache
+                 self._save_review(review_file_path, file_name, review_d)
+
              reviews[sample.index] = review_d
-             dump_jsonl_data(
-                 review_d,
-                 os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
-                 dump_mode=DumpMode.APPEND)
+
          return reviews

+     def _clear_review_files(self, review_file_path: str) -> None:
+         """Clear existing review files."""
+         if os.path.isdir(review_file_path):
+             for filename in os.listdir(review_file_path):
+                 file_path = os.path.join(review_file_path, filename)
+                 try:
+                     if os.path.isfile(file_path):
+                         os.remove(file_path)
+                 except Exception as e:
+                     logger.error(f'Error deleting file {file_path}: {e}')
+         else:
+             os.remove(review_file_path)
+
+     def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+         """Load existing reviews from files."""
+         logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+         if os.path.isdir(review_file_path):
+             for filename in os.listdir(review_file_path):
+                 if '.ipynb_checkpoints' in filename:
+                     continue
+                 file_path = os.path.join(review_file_path, filename)
+                 with open(file_path, 'r') as f:
+                     review_history = [json.loads(line.strip()) for line in f]
+                 review_history_map[filename] = {item['index']: item for item in review_history}
+
+     def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+         """Save a single review to file."""
+         file_path = os.path.join(review_file_path, file_name)
+         dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
      def get_scores(self, reviews) -> float:
          scores = defaultdict(dict)
          for sample in tqdm(self.dataset, desc='Getting scores'):