evalscope 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)
  1. evalscope/arguments.py +10 -0
  2. evalscope/backend/rag_eval/utils/llm.py +1 -1
  3. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  5. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  6. evalscope/benchmarks/data_adapter.py +4 -2
  7. evalscope/benchmarks/drop/__init__.py +0 -0
  8. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  9. evalscope/benchmarks/drop/utils.py +59 -0
  10. evalscope/benchmarks/general_qa/general_qa_adapter.py +8 -4
  11. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  12. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  13. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
  14. evalscope/benchmarks/tool_bench/utils.py +202 -0
  15. evalscope/benchmarks/utils.py +3 -2
  16. evalscope/benchmarks/winogrande/__init__.py +0 -0
  17. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  18. evalscope/collections/evaluator.py +76 -26
  19. evalscope/config.py +46 -15
  20. evalscope/evaluator/evaluator.py +48 -14
  21. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  22. evalscope/metrics/llm_judge.py +3 -3
  23. evalscope/metrics/rouge_metric.py +11 -13
  24. evalscope/models/adapters/chat_adapter.py +51 -34
  25. evalscope/models/adapters/server_adapter.py +15 -19
  26. evalscope/perf/arguments.py +14 -5
  27. evalscope/perf/benchmark.py +4 -9
  28. evalscope/perf/main.py +69 -17
  29. evalscope/perf/utils/benchmark_util.py +33 -15
  30. evalscope/perf/utils/db_util.py +32 -20
  31. evalscope/perf/utils/log_utils.py +1 -1
  32. evalscope/perf/utils/rich_display.py +186 -0
  33. evalscope/report/app.py +47 -34
  34. evalscope/report/utils.py +1 -1
  35. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  36. evalscope/utils/deprecation_utils.py +42 -0
  37. evalscope/version.py +2 -2
  38. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/METADATA +49 -25
  39. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/RECORD +48 -38
  40. tests/aigc/test_t2i.py +4 -4
  41. tests/cli/test_all.py +3 -0
  42. tests/cli/test_collection.py +2 -1
  43. tests/cli/test_run.py +37 -14
  44. tests/perf/test_perf.py +27 -2
  45. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/LICENSE +0 -0
  46. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.15.0.dist-info → evalscope-0.16.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -9,6 +9,15 @@ class ParseStrArgsAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         assert isinstance(values, str), 'args should be a string.'

+        # try json load first
+        try:
+            arg_dict = json.loads(values)
+            setattr(namespace, self.dest, arg_dict)
+            return
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+        # If JSON load fails, fall back to parsing as key=value pairs
         arg_dict = {}
         for arg in values.strip().split(','):
             key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
@@ -67,6 +76,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--work-dir', type=str, help='The root cache dir.')

     # Debug and runtime mode arguments
+    parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
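With this change, any option parsed by ParseStrArgsAction accepts either a JSON object or the older comma-separated key=value string. A minimal sketch, assuming ParseStrArgsAction is imported from evalscope.arguments and using a --model-args flag purely for illustration:

import argparse
from evalscope.arguments import ParseStrArgsAction

parser = argparse.ArgumentParser()
parser.add_argument('--model-args', action=ParseStrArgsAction)

# JSON form, handled by the new json.loads branch
args = parser.parse_args(['--model-args', '{"temperature": 0.7, "max_tokens": 512}'])

# key=value form, handled by the existing fallback parser
args = parser.parse_args(['--model-args', 'temperature=0.7,max_tokens=512'])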
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -52,7 +52,7 @@ class LocalLLM(BaseLLM):
         """Run the LLM on the given input."""
         infer_cfg = {'stop': stop}

-        response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+        response, _ = self.model.predict([{'data': [prompt]}], infer_cfg=infer_cfg)
         return response[0][0]

     @property
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py CHANGED
@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py CHANGED
@@ -96,12 +96,6 @@ class AlpacaEvalAdapter(DataAdapter):
         return None

     def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-
-        Args:
-            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
-        """
         # zip dict answers
         res_list = [res for res in review_res_list if res is not None]

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py CHANGED
@@ -148,6 +148,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
+            'judge_response': grading_response,
         }

     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
evalscope/benchmarks/data_adapter.py CHANGED
@@ -258,7 +258,7 @@ class DataAdapter(ABC):
             avg_res: Dict[str, List[float]]

         """
-        if isinstance(review_res_list[0], list):
+        if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
             review_res_list = [item for sublist in review_res_list for item in sublist]

         items = defaultdict(list)
@@ -322,6 +322,7 @@ class DataAdapter(ABC):
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
                         id: Optional[Union[int, str]] = None,
+                        messages: Optional[List[dict]] = None,
                         **kwargs) -> dict:
         data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
@@ -329,7 +330,8 @@ class DataAdapter(ABC):
             multi_choices=choices or self.choices,
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
-            id=id)
+            id=id,
+            messages=messages)
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
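The new messages field lets an adapter pass pre-built chat messages straight through to PromptData instead of a flat prompt string. A minimal sketch of a hypothetical subclass, mirroring what the ToolBench adapter later in this diff does (the dataset field name is illustrative):

from evalscope.benchmarks import DataAdapter

class MyChatAdapter(DataAdapter):  # hypothetical adapter, for illustration only

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # messages prepared by the dataset, e.g. [{'role': 'user', 'content': '...'}]
        messages = input_d['messages']
        return self.gen_prompt_data(prompt='', messages=messages)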
evalscope/benchmarks/drop/__init__.py ADDED
File without changes
evalscope/benchmarks/drop/drop_adapter.py ADDED
@@ -0,0 +1,133 @@
+import re
+from typing import List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DROP_EXAMPLES = '''Some examples of passages and Q&A are provided below.
+
+# Examples
+---
+Passage: Trunajaya rebellion or Trunajaya War was the ultimately unsuccessful rebellion waged by the Madurese prince Trunajaya and fighters from Makassar against the Mataram Sultanate and its Dutch East India Company supporters in Java during the 1670s. The rebellion was initially successful: the rebels defeated the royal army at Gegodog , captured most of the Javanese north coast, and took the Mataram capital Plered . King Amangkurat I died during the retreat of the royal court. His son and successor, Amangkurat II, requested help from the VOC in exchange for financial remuneration and geopolitical concessions. The VOC\'s subsequent involvement turned the tide of the war. VOC and Mataram forces recovered lost territories and overran Trunajaya\'s new capital at Kediri . However, the rebellion continued until the capture of Trunajaya at the end of 1679, and the defeat, death, or surrender of the other rebel leaders . Trunajaya was killed by Amangkurat II personally in 1680 while a prisoner of the VOC. After his father\'s death in 1677, Amangkurat II also faced rival claims to the throne. The most serious rival was his brother Pangeran Puger, who took the capital Plered in 1677 and did not surrender until 1681.
+Question: How many years was it between Trunajaya\'s capture and his death while prisoner of the VOC?
+Answer: 1
+
+---
+Passage: Led by former Giant Kurt Warner, the defending NFC champions took the field at Giants Stadium against a Giants team still reeling from their bad loss in New Orleans. The Giants scored first, sending Jacobs in for a 4-yard touchdown run following a Terrell Thomas interception. Later, Arizona running back Beanie Wells scored his first career touchdown on a 13-yard rush. Manning responded by throwing a 62-yard touchdown to Nicks for his longest reception of the year. In the second half, the Cardinals\' Tim Hightower and Jason Wright scored touchdowns. But it was turnovers that decided this game; Manning\'s 3 interceptions were as many as he had thrown all season. The Giants scored only 3 points in the second half, ending the game on an interception to Antrel Rolle. The Giants notable streak of 38 consecutive starts by the same offensive line unit was ended here, as offensive tackle Kareem McKenzie missed the game with a groin injury. McKenzie returned the following week.
+Question: Which player made the first score of the game?
+Answer: Jacobs
+
+---
+Passage: Hoping to rebound from their road loss to the Bills, the Chargers flew to Wembley Stadium for the 2008 International Series game with the New Orleans Saints. In the first quarter, San Diego trailed early as kicker Taylor Mehlhaff got a 23-yard field goal. The \'Bolts would respond with kicker Nate Kaeding getting a 33-yard field goal. In the second quarter, New Orleans regained the lead as QB Drew Brees (a former Charger) completed a 12-yard TD pass to WR Devery Henderson (with a failed PAT) and RB Deuce McAllister getting a 1-yard TD run. San Diego answered as QB Philip Rivers completed a 12-yard TD pass to RB LaDainian Tomlinson, but the Saints replied with Brees completing a 30-yard TD pass to WR Lance Moore. The Chargers closed out the half with Rivers completing a 12-yard TD pass to TE Antonio Gates. In the third quarter, New Orleans increased its lead Brees completing a 1-yard TD pass to TE Mark Campbell, after a very controversial Pass interference call on cornerback Cletis Gordon put the Saints on the 1-yard line. The \'Bolts would answer with Kaeding getting a 24-yard field goal. In the fourth quarter, the Saints continued to build its lead as FB Mike Karney got a 1-yard TD run. San Diego tried to rally as Kaeding nailed a 31-yard field goal, Rivers completed a 14-yard TD pass to WR Vincent Jackson, and Brees giving the \'Bolts a safety via an incomplete pass thrown into the back of his own endzone. However, New Orleans\' defense stiffened for the win. With the loss, the Chargers went into their bye week at 3-5.
+Question: How many total yards of touchdown passes did Drew Brees make?
+Answer: 43
+
+'''  # noqa: E501
+
+
+@Benchmark.register(
+    name='drop',
+    pretty_name='DROP',
+    dataset_id='AI-ModelScope/DROP',
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='validation',
+    prompt_template=
+    'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+)
+class DROPAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        few_shot_num = kwargs.get('few_shot_num', 0)
+        if few_shot_num != 0:
+            self.few_shot_num = 3
+            logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
+        else:
+            self.few_shot_num = 0
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
+        query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
+        prompt = self.prompt_template.format(
+            drop_examples=drop_examples,
+            query=query,
+        )
+        return self.gen_prompt_data(prompt)
+
+    def get_gold_answer(self, input_d: dict) -> List[str]:
+        """
+        Parse the raw input labels (gold).
+        """
+
+        def _flatten_validated_answers(validated_answers):
+            """Flattens a dict of lists of validated answers.
+            {"number": ['1', '8'], ...}
+            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
+            """
+            valid_answers = []
+            for i in range(len(validated_answers['number'])):
+                valid_answers.append({
+                    'number': validated_answers['number'][i],
+                    'date': validated_answers['date'][i],
+                    'spans': validated_answers['spans'][i],
+                })
+            return valid_answers
+
+        answers = []
+        answers_set = set()
+        candidates = [input_d['answer']] + _flatten_validated_answers(input_d['validated_answers'])
+        for candidate in candidates:
+            answer = DROPAdapter.parse_answer(candidate)
+            if answer in answers_set:
+                continue
+            answers_set.add(answer)
+            answers.append(answer)
+        return answers
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
+        extracted_answer = match.group(1) if match else result
+        return extracted_answer
+
+    def match(self, gold: List[str], pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import _answer_to_bags
+
+        max_em = 0
+        for gold_answer in gold:
+            # Convert the answers to bags of answers
+            predicted_bags = _answer_to_bags(pred)
+            gold_bags = _answer_to_bags(gold_answer)
+
+            if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
+                exact_match = 1.0
+            else:
+                exact_match = 0.0
+            # Check if the answer is empty
+            if gold_answer[0].strip():
+                max_em = max(max_em, exact_match)
+
+        return max_em
+
+    @staticmethod
+    def parse_answer(answer):
+        # NOTE: Everything is returned as a tuple for uniformity and hashability.
+        if answer['number'] != '':
+            return (str(answer['number']), )
+        if answer['spans'] != []:
+            return tuple(answer['spans'])
+        return (' '.join([answer['date']['day'], answer['date']['month'], answer['date']['year']]).strip(), )
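For reference, parse_answer maps the three gold-answer shapes in the DROP schema (number, spans, date) onto hashable tuples; the inputs below are made-up examples of those shapes, not dataset records:

from evalscope.benchmarks.drop.drop_adapter import DROPAdapter

DROPAdapter.parse_answer({'number': '3', 'spans': [], 'date': {'day': '', 'month': '', 'year': ''}})
# -> ('3',)
DROPAdapter.parse_answer({'number': '', 'spans': ['Jacobs', 'Manning'], 'date': {'day': '', 'month': '', 'year': ''}})
# -> ('Jacobs', 'Manning')
DROPAdapter.parse_answer({'number': '', 'spans': [], 'date': {'day': '6', 'month': 'May', 'year': '1980'}})
# -> ('6 May 1980',)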
evalscope/benchmarks/drop/utils.py ADDED
@@ -0,0 +1,59 @@
+import re
+import string
+
+_ARTICLES = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+
+
+def _answer_to_bags(answer):
+    if isinstance(answer, (list, tuple)):
+        raw_spans = answer
+    else:
+        raw_spans = [answer]
+    normalized_spans = []
+    token_bags = []
+    for raw_span in raw_spans:
+        normalized_span = _normalize(raw_span)
+        normalized_spans.append(normalized_span)
+        token_bags.append(set(normalized_span.split()))
+    return normalized_spans, token_bags
+
+
+def _is_number(text):
+    try:
+        float(text)
+        return True
+    except ValueError:
+        return False
+
+
+def _remove_articles(text):
+    return _ARTICLES.sub(' ', text)
+
+
+def _white_space_fix(text):
+    return ' '.join(text.split())
+
+
+def _remove_punc(text):
+    exclude = set(string.punctuation)
+    if not _is_number(text):
+        return ''.join(ch for ch in text if ch not in exclude)
+    else:
+        return text
+
+
+def _fix_number(text):
+    return str(float(text)) if _is_number(text) else text
+
+
+def _tokenize(text):
+    return re.split(' |-', text)
+
+
+def _normalize(answer):
+    tokens = [
+        _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for token in _tokenize(answer)
+    ]
+    tokens = [token for token in tokens if token.strip()]
+    normalized = ' '.join(tokens).strip()
+    return normalized
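A quick sketch of what these helpers do to a free-form answer before the bag-of-words comparison in DROPAdapter.match; the input strings are invented, and the outputs follow directly from the code above:

from evalscope.benchmarks.drop.utils import _answer_to_bags, _normalize

# Articles and punctuation are stripped and numbers are canonicalized via float()
_normalize('The 12-yard TD pass.')
# -> '12.0 yard td pass'

# Both a bare string and a tuple of spans can be turned into (normalized spans, token bags)
spans, bags = _answer_to_bags(('Jacobs', 'a 4-yard run'))
# spans == ['jacobs', '4.0 yard run']; bags == [{'jacobs'}, {'4.0', 'yard', 'run'}]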
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -4,7 +4,7 @@ from collections import defaultdict
 from typing import List, Optional, Union

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
+from evalscope.metrics import mean
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list

         data_file_dict = defaultdict(str)
-        data_list = []
+        data_item_dict = defaultdict(list)

         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                data_list.extend(jsonl_to_list(file_path))
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')

-        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}

         return data_dict

@@ -112,9 +112,13 @@ class GeneralQAAdapter(DataAdapter):
         """
         res = dict()
         if 'AverageRouge' in self.metric_list:
+            from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
             rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
             res.update(rouge_dict)
         if 'AverageBLEU' in self.metric_list:
+            from evalscope.metrics import bleu_ngram_one_sample
+
             bleu_dict = bleu_ngram_one_sample(pred, gold)
             res.update(bleu_dict)
         return res
evalscope/benchmarks/simple_qa/simple_qa_adapter.py CHANGED
@@ -148,6 +148,7 @@ class SimpleQAAdapter(DataAdapter):
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
+            'judge_response': grading_response,
         }

     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
evalscope/benchmarks/tool_bench/__init__.py ADDED
File without changes
evalscope/benchmarks/tool_bench/tool_bench_adapter.py ADDED
@@ -0,0 +1,67 @@
+from typing import Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import Metric, mean, metric_registry
+
+
+@Benchmark.register(
+    name='tool_bench',
+    pretty_name='ToolBench-Static',
+    dataset_id='AI-ModelScope/ToolBench-Static',
+    subset_list=['in_domain', 'out_of_domain'],
+    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ToolBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        metric_registry.register(Metric(name='Rouge-L', object=mean))
+        metric_registry.register(Metric(name='Act.EM', object=mean))
+        metric_registry.register(Metric(name='Plan.EM', object=mean))
+        metric_registry.register(Metric(name='F1', object=mean))
+        metric_registry.register(Metric(name='HalluRate', object=mean))
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        """
+        messages = input_d['messages']
+        # use prepared messages
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return result
+
+    def match(self, gold: dict, pred: str) -> Dict:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        from .utils import calculate_metrics
+
+        data = {
+            'target': gold['target'],
+            'predictions': pred,
+            'tools': gold['tools'],
+        }
+        metrics = calculate_metrics(data)
+        return metrics
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
+        # aggregate review results
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
evalscope/benchmarks/tool_bench/utils.py ADDED
@@ -0,0 +1,202 @@
+import json
+from rouge import Rouge
+
+
+def evaluate_rougel(cand_list: list, ref_list: list):
+    if len(ref_list) == 0:
+        return 0
+    rouge = Rouge()
+    rouge_score = rouge.get_scores(hyps=cand_list, refs=ref_list, avg=True)
+    rougel = rouge_score['rouge-l']['f']
+    return rougel
+
+
+def evaluate_action_em(cand_list: list, ref_list: list):
+    if len(ref_list) == 0:
+        return 0
+    em = 0
+    for cand, ref in zip(cand_list, ref_list):
+        em += (1 if cand == ref else 0)
+    return em / len(cand_list)
+
+
+def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
+    easy_f1 = []
+    hard_f1 = []
+    f1 = []
+    for i in range(len(action_pred)):
+        ref_action = action_ref[i]
+        pred_action = action_pred[i]
+
+        ref_input = ref_list[i]
+        cand_input = cand_list[i]
+
+        if ref_action != pred_action:
+            easy_f1.append(0)
+            hard_f1.append(0)
+            f1.append(0)
+        else:
+            try:
+                ref_input_json = json.loads(ref_input)
+                try:
+                    cand_input_json = json.loads(cand_input)
+                    half_match = 0
+                    full_match = 0
+                    if ref_input_json == {}:
+                        if cand_input_json == {}:
+                            easy_f1.append(1)
+                            f1.append(1)
+                        else:
+                            easy_f1.append(0)
+                            f1.append(0)
+                    else:
+                        for k, v in ref_input_json.items():
+                            if k in cand_input_json.keys():
+                                if cand_input_json[k] == v:
+                                    full_match += 1
+                                else:
+                                    half_match += 1
+
+                        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
+                        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
+                        hard_f1.append((2 * recall * precision) / (recall + precision))
+                        f1.append((2 * recall * precision) / (recall + precision))
+                except Exception:
+                    # cand_input = cand_input.replace("\n","").replace("\"","")
+                    # ref_input = cand_input.replace("\n","").replace("\"","")
+                    # rouge = Rouge()
+                    # rouge_score = rouge.get_scores(hyps=[cand_input], refs=[ref_input], avg=True)
+                    if ref_input_json == {}:
+                        easy_f1.append(0)
+                    else:
+                        hard_f1.append(0)
+                        # hard_f1.append(rouge_score["rouge-l"]["f"])
+                        # f1.append(rouge_score["rouge-l"]["f"])
+                    f1.append(0)
+            except Exception:
+                pass
+
+    # Check whether the lists are empty; return 0 if so
+    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
+    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
+    f1_avg = sum(f1) / len(f1) if f1 else 0
+
+    return easy_f1_avg, hard_f1_avg, f1_avg
+
+
+def parse_action(text):
+    action = 'None'
+    action_input = '{}'
+    if 'Action Input:' in text:
+        input_idx = text.rindex('Action Input:')
+        action_input = text[input_idx + len('Action Input:'):].strip()
+    else:
+        action_input = '{}'
+
+    if 'Action:' in text:
+        action_idx = text.rindex('Action:')
+        action = text[action_idx + len('Action:'):].strip()
+        if 'Action Input:' in action:
+            input_idx = action.index('Action Input:')
+            action = action[:input_idx].strip()
+    else:
+        action = 'none'
+    return action, action_input
+
+
+def parse_output(text):
+    action, action_input = parse_action(text)
+    if action == 'Finish':
+        try:
+            action_input = json.loads(action_input)
+            # print(action_input)
+            # print(json.dumps(action_input,indent=2))
+            return_type = action_input['return_type']
+            if return_type == 'give_answer':
+                if 'final_answer' in action_input.keys():
+                    answer = str(action_input['final_answer'])
+                    if answer.strip() in ['', '.', ',']:
+                        answer = 'None'
+                else:
+                    answer = 'None'
+                return 'finish', action, action_input, answer
+            else:
+                return 'give up', None, None, None
+        except Exception:
+            return 'give up', None, None, None
+    else:
+        plan = 'call'
+        answer = None
+        return plan, action, action_input, answer
+
+
+def calculate_metrics(data):
+    """
+    Calculate the metrics for the given data.
+    """
+    plan_ref = []
+    plan_pred = []
+    hallu_cases = []
+    answer_ref = []
+    action_ref = []
+    action_input_ref = []
+    answer_pred = []
+    action_pred = []
+    action_input_pred = []
+    hallu_pred = 0
+
+    reference = data['target']
+    prediction = data['predictions']
+    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
+    # ref_plan: call
+    # ref_action: spott
+    # ref_input: {"is_id": "city center" }
+    # ref_ans: None
+
+    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
+    if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
+        return {}
+    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
+        return {}
+
+    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
+        hallu_pred += 1
+        hallu_cases.append(data)
+
+    plan_ref.append(ref_plan)
+    plan_pred.append(pred_plan)
+    if ref_plan == 'give up':
+        pass
+    elif ref_plan == 'finish':
+        answer_ref.append(ref_ans)
+        if pred_ans is None:
+            answer_pred.append('none')
+        else:
+            answer_pred.append(pred_ans)
+    else:
+        action_ref.append(ref_action)
+        action_input_ref.append(ref_input)
+        if pred_action is None:
+            action_pred.append('none')
+        else:
+            action_pred.append(pred_action)
+
+        if pred_input is None:
+            action_input_pred.append('{}')
+        else:
+            action_input_pred.append(pred_input)
+
+    metric = {}
+    rouge = evaluate_rougel(answer_pred, answer_ref)
+    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
+    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
+    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
+    hallu_rate = hallu_pred
+    metric['Act.EM'] = action_em
+    metric['F1'] = f1
+    metric['HalluRate'] = hallu_rate
+    metric['plan_em'] = plan_em
+    metric['Easy_F1'] = easy_f1
+    metric['Hard_F1'] = hard_f1
+    metric['Rouge-L'] = rouge
+    return metric
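As a rough illustration of the parsing step that feeds calculate_metrics, parse_output splits a ReAct-style completion into (plan, action, action input, answer); the response strings below are invented:

from evalscope.benchmarks.tool_bench.utils import parse_output

call_text = 'Thought: look it up\nAction: spott\nAction Input: {"is_id": "city center"}'
parse_output(call_text)
# -> ('call', 'spott', '{"is_id": "city center"}', None)

finish_text = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "Paris"}'
parse_output(finish_text)
# -> ('finish', 'Finish', {'return_type': 'give_answer', 'final_answer': 'Paris'}, 'Paris')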
evalscope/benchmarks/utils.py CHANGED
@@ -13,6 +13,7 @@ class PromptData:
     system_prompt: Optional[str] = None
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
+    messages: Optional[List[dict]] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
@@ -21,7 +22,7 @@ class PromptData:
 def preprocess_decorator(func):

     @wraps(func)
-    def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+    def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
         if result is None:
             result = ''
         filters = self.config_kwargs.get('filters', None)
@@ -29,6 +30,6 @@ def preprocess_decorator(func):
             # Apply filters to the resultply filters to the result
             for filter_name, filter_value in filters.items():
                 result = Filter.apply(filter_name, result, filter_value)
-        return func(self, result, raw_input_d, eval_type)
+        return func(self, result, raw_input_d, **kwargs)

     return wrapper
evalscope/benchmarks/winogrande/__init__.py ADDED
File without changes