evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (105)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +10 -6
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -108
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/perf/arguments.py +1 -0
  58. evalscope/perf/benchmark.py +1 -1
  59. evalscope/perf/main.py +3 -1
  60. evalscope/perf/plugin/api/openai_api.py +51 -47
  61. evalscope/perf/utils/local_server.py +1 -0
  62. evalscope/run.py +37 -66
  63. evalscope/run_arena.py +1 -1
  64. evalscope/utils/__init__.py +1 -1
  65. evalscope/utils/chat_service.py +4 -3
  66. evalscope/utils/io_utils.py +8 -0
  67. evalscope/utils/logger.py +4 -0
  68. evalscope/utils/model_utils.py +10 -0
  69. evalscope/utils/utils.py +3 -25
  70. evalscope/version.py +2 -2
  71. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
  72. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
  73. tests/cli/test_collection.py +53 -0
  74. tests/cli/test_run.py +43 -1
  75. tests/perf/test_perf.py +3 -3
  76. tests/rag/test_mteb.py +3 -2
  77. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  78. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  79. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  80. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  81. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  82. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  83. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  84. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  85. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  86. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  87. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  88. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  89. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  90. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  91. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  92. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  93. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  94. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  95. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  96. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  97. evalscope/models/api/__init__.py +0 -3
  98. evalscope/models/dummy_chat_model.py +0 -49
  99. evalscope/models/model_adapter.py +0 -525
  100. evalscope/models/openai_model.py +0 -103
  101. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  102. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  103. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  104. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/collections/schema.py ADDED
@@ -0,0 +1,122 @@
+ import copy
+ import json
+ from dataclasses import asdict, dataclass, field
+ from typing import List, Union
+
+
+ @dataclass
+ class DatasetInfo:
+     name: str
+     weight: float = 1.0  # sample weight in each collection
+     task_type: str = ''
+     tags: List[str] = field(default_factory=list)
+     args: dict = field(default_factory=dict)
+
+     def get_data(self) -> dict:
+         from evalscope.benchmarks import Benchmark
+
+         benchmark_meta = Benchmark.get(self.name)
+
+         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+         data_dict = data_adapter.load(
+             dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+         prompts = data_adapter.gen_prompts(data_dict)
+         return prompts
+
+
+ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+     total_weight = sum(dataset.weight for dataset in collection.datasets)
+     for dataset in collection.datasets:
+         current_weight = dataset.weight / total_weight * base_weight
+         if isinstance(dataset, CollectionSchema):
+             flatten_weight(dataset, current_weight)
+         else:
+             dataset.weight = current_weight
+
+
+ def flatten_tags(collection: 'CollectionSchema', parent_names=None):
+     if parent_names is None:
+         parent_names = []
+     current_names = parent_names + [collection.name]
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flatten_tags(dataset, current_names)
+         else:
+             # Add all parent CollectionSchema names to the tags of each DatasetInfo
+             for name in current_names:
+                 if name not in dataset.tags:
+                     dataset.tags.append(name)
+
+
+ def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+     flat_datasets = []
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flat_datasets.extend(flatten_datasets(dataset))
+         else:
+             flat_datasets.append(dataset)
+     return flat_datasets
+
+
+ @dataclass
+ class CollectionSchema:
+     name: str
+     weight: float = 1.0
+     datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+     def to_dict(self):
+         return {
+             'name':
+             self.name,
+             'weight':
+             self.weight,
+             'datasets':
+             [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+         }
+
+     @classmethod
+     def from_dict(cls, data):
+         instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+         for dataset in data.get('datasets', []):
+             if 'datasets' in dataset:
+                 instance.datasets.append(CollectionSchema.from_dict(dataset))
+             else:
+                 instance.datasets.append(DatasetInfo(**dataset))
+         return instance
+
+     def dump_json(self, file_path):
+         d = self.to_dict()
+         with open(file_path, 'w') as f:
+             json.dump(d, f, ensure_ascii=False, indent=4)
+
+     @classmethod
+     def from_json(cls, file_path):
+         with open(file_path, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def flatten(self) -> List[DatasetInfo]:
+         collection = copy.deepcopy(self)
+         flatten_tags(collection)
+         flatten_weight(collection)
+         return flatten_datasets(collection)
+
+
+ if __name__ == '__main__':
+     schema = CollectionSchema(
+         name='reasoning',
+         datasets=[
+             DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
+             DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh'], args={'subset_list': ['logic']})
+         ])
+     print(schema)
+     print(schema.flatten())
+     schema.dump_json('outputs/schema.json')
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema)
+     for dataset in schema.flatten():
+         print(dataset)
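
For orientation, the new CollectionSchema supports nesting: flatten() normalizes weights level by level and copies every ancestor collection name into each DatasetInfo's tags. A minimal sketch based on the __main__ block above (the 'mixed' and 'english' collection names are illustrative, not part of the release):

    from evalscope.collections.schema import CollectionSchema, DatasetInfo

    schema = CollectionSchema(
        name='mixed',
        datasets=[
            CollectionSchema(
                name='english',
                weight=2,
                datasets=[DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en'])]),
            DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh']),
        ])

    for info in schema.flatten():
        # arc ends up with weight 2/3 and tags ['en', 'mixed', 'english'];
        # ceval gets weight 1/3 and tags ['zh', 'mixed'].
        print(info.name, round(info.weight, 3), info.tags)
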
evalscope/config.py CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
  @dataclass
  class TaskConfig:
      # Model-related arguments
-     model: Union[str, CustomModel, None] = None
+     model: Union[str, 'CustomModel', None] = None
      model_id: Optional[str] = None
      model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

@@ -40,8 +40,8 @@ class TaskConfig:
      chat_template: Optional[str] = None

      # Dataset-related arguments
-     datasets: Optional[List[str]] = None
-     dataset_args: Optional[Dict] = field(default_factory=dict)
+     datasets: List[str] = field(default_factory=list)
+     dataset_args: Dict = field(default_factory=dict)
      dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
      dataset_hub: str = HubType.MODELSCOPE

@@ -64,7 +64,9 @@ class TaskConfig:
      # Debug and runtime mode arguments
      debug: bool = False
      dry_run: bool = False
-     seed: int = 42
+     seed: Optional[int] = 42
+     api_url: Optional[str] = None  # Only used for server model
+     api_key: Optional[str] = 'EMPTY'  # Only used for server model

      def __post_init__(self):
          if (not self.model_id) and self.model:
@@ -74,7 +76,6 @@ class TaskConfig:
              self.model_id = os.path.basename(self.model).rstrip(os.sep)

      def to_dict(self):
-         # Note: to avoid serialization error for some model instance
          return self.__dict__

      def __str__(self):
@@ -114,7 +115,9 @@ class TaskConfig:
      def from_args(args: Namespace):
          # Convert Namespace to a dictionary and filter out None values
          args_dict = {k: v for k, v in vars(args).items() if v is not None}
-         del args_dict['func']  # Note: compat CLI arguments
+
+         if 'func' in args_dict:
+             del args_dict['func']  # Note: compat CLI arguments

          return TaskConfig.from_dict(args_dict)

@@ -128,6 +131,7 @@ class TaskConfig:
                  continue

              task.model = custom_model
+             task.model_args = custom_model.config
              task.model_id = type(custom_model).__name__
              res_list.append(task)
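
The new api_url/api_key fields pair with the service evaluation type added in constants.py below. A hedged sketch of a TaskConfig pointing at an OpenAI-compatible endpoint (URL and model name are placeholders, and the eval_type value assumes the field accepts the new EvalType.SERVICE string):

    from evalscope.config import TaskConfig

    task_cfg = TaskConfig(
        model='my-served-model',       # placeholder served-model name
        datasets=['gsm8k'],            # datasets now defaults to [] instead of None
        eval_type='service',           # assumption: matches the new EvalType.SERVICE
        api_url='http://127.0.0.1:8000/v1/chat/completions',  # only used for server models
        api_key='EMPTY',
    )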
 
evalscope/constants.py CHANGED
@@ -135,34 +135,13 @@ class EvalStage:
  class EvalType:

      CUSTOM = 'custom'
-     CHECKPOINT = 'checkpoint'
+     CHECKPOINT = 'checkpoint'  # native model checkpoint
+     SERVICE = 'service'  # model service


  class EvalBackend:
-
-     class _Backend:
-         # compatible with old version, set 'value'
-
-         def __init__(self, value):
-             self._value = value
-
-         @property
-         def value(self):
-             return self._value
-
-         def __str__(self):
-             return self._value
-
-         def __repr__(self):
-             return f"'{self._value}'"
-
-         def __eq__(self, other):
-             if isinstance(other, str):
-                 return self._value == other
-             return NotImplemented
-
-     NATIVE = _Backend('Native')
-     OPEN_COMPASS = _Backend('OpenCompass')
-     VLM_EVAL_KIT = _Backend('VLMEvalKit')
-     RAG_EVAL = _Backend('RAGEval')
-     THIRD_PARTY = _Backend('ThirdParty')
+     NATIVE = 'Native'
+     OPEN_COMPASS = 'OpenCompass'
+     VLM_EVAL_KIT = 'VLMEvalKit'
+     RAG_EVAL = 'RAGEval'
+     THIRD_PARTY = 'ThirdParty'
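
With the _Backend wrapper removed, the backend constants are plain strings; comparisons against strings behave as before, but code that relied on the old .value attribute has to drop it. A small illustration:

    from evalscope.constants import EvalBackend

    assert EvalBackend.NATIVE == 'Native'

    backend = 'OpenCompass'
    if backend == EvalBackend.OPEN_COMPASS:
        print('dispatching to the OpenCompass backend')
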
evalscope/evaluator/evaluator.py CHANGED
@@ -10,9 +10,8 @@ from typing import Any, Dict, List, Optional, Union

  from evalscope.benchmarks import DataAdapter
  from evalscope.config import TaskConfig
- from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                  ReviewKeys)
- from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
+ from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+ from evalscope.models import BaseModelAdapter, CustomModelAdapter
  from evalscope.tools.combine_reports import gen_table
  from evalscope.utils import dict_torch_dtype_to_str, gen_hash
  from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -30,72 +29,63 @@ class Evaluator(object):
              if the dataset is a local path, e.g. /path/to/your_dataset_name,
              then the task name will be the basename of the path, which is `your_dataset_name`.
          data_adapter: DataAdapter, the data adapter for the dataset.
-         subset_list: list, the subset list for the dataset.
          model_adapter: BaseModelAdapter, the model adapter for the model.
-         use_cache: str, path to local cache. Default: None
-         outputs_dir: OutputsStructure, the outputs dir. Default: None
-         datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-         datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-         stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-         eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-         overall_task_cfg: dict, the overall task config. Default: None
+         outputs: OutputsStructure, the outputs dir. Default: None
+         task_cfg: TaskConfig, the overall task config. Default: None
          **kwargs: kwargs.
      """

      def __init__(self,
                   dataset_name_or_path: str,
                   data_adapter: DataAdapter,
-                  subset_list: Optional[list] = None,
-                  model_adapter: Optional[BaseModelAdapter] = None,
-                  use_cache: Optional[str] = None,
-                  outputs: Optional[OutputsStructure] = None,
-                  datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                  datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                  stage: Optional[str] = EvalStage.ALL,
-                  eval_type: Optional[str] = EvalType.CHECKPOINT,
-                  overall_task_cfg: Optional[TaskConfig] = None,
+                  model_adapter: BaseModelAdapter,
+                  outputs: OutputsStructure = None,
+                  task_cfg: TaskConfig = None,
                   **kwargs):

          self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
          self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-         self.model_name = overall_task_cfg.model_id
+         self.model_name = task_cfg.model_id
          self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-         self.datasets_dir = os.path.expanduser(datasets_dir)
-         self.kwargs = kwargs
          self.data_adapter = data_adapter
          self.model_adapter = model_adapter
-         self.eval_type = eval_type
-         self.stage = stage
-         self.use_cache = use_cache
-         self.overall_task_cfg = overall_task_cfg
-         if isinstance(self.model_adapter, CustomModelAdapter):
-             self.overall_task_cfg.model_args = self.model_adapter.custom_model.config
-
-         self.model_cfg = self.model_adapter.model_cfg
-
+         self.model_cfg = model_adapter.model_cfg
+         self.eval_type = task_cfg.eval_type
+         self.dataset_hub = task_cfg.dataset_hub
+         self.stage = task_cfg.stage
+         self.use_cache = task_cfg.use_cache
+         self.task_cfg = task_cfg
          # Deal with the output paths
          self.outputs_structure = outputs

-         # Load dataset
-         self.dataset = self.data_adapter.load(
-             dataset_name_or_path=dataset_name_or_path,
-             subset_list=subset_list,
-             work_dir=self.datasets_dir,
-             datasets_hub=datasets_hub,
-             **kwargs)
-
-         # Get prompts from dataset
-         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-         del self.dataset
-
-     def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+         self.kwargs = kwargs

-         ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
-         ans[AnswerKeys.ANSWER_ID] = answer_id
-         ans[AnswerKeys.SUBSET_NAME] = subset_name
+     def load_dataset(self):
+         dataset = self.data_adapter.load(
+             dataset_name_or_path=self.dataset_name_or_path,
+             subset_list=self.data_adapter.subset_list,
+             work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+             datasets_hub=self.dataset_hub,
+             **self.kwargs)

-         return ans
+         # Get prompts from dataset
+         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+         return prompts
+
+     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+         input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+         infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+         return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+     def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+         answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+         answer_d[AnswerKeys.ANSWER_ID] = answer_id
+         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+         return answer_d

      def get_answers(self,
                      subset_name: str,
@@ -146,57 +136,24 @@ class Evaluator(object):
              resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                  inputs=prompts_list, infer_cfg=infer_cfg)

-             assert len(prompts_list) == len(resp_answers_list), \
-                 f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
-
-             for in_d, resp_d in zip(prompts_list, resp_answers_list):
-
-                 # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                 model_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                     ensure_ascii=False)
-                 input_prompt_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                 infer_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                 resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                 resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                 resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                 resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                 resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                 answers_list.append(resp_d)
-                 dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+             for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                 answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                 processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                 answers_list.append(processed_answer)
+                 dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

          else:
              for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-                 # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                 model_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                     ensure_ascii=False)
-                 input_prompt_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                 infer_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                 # Get answers
-                 answer_d: dict = self._pred_answer(
-                     input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                 answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                 answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                 answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                 answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                 answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                 processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

                  if debug:
                      logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                     logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+                     logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')

-                 answers_list.append(answer_d)
-                 dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+                 answers_list.append(processed_answer)
+                 dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)

          logger.info(f'Dump predictions to {pred_file_path}.')
          return answers_list
@@ -240,6 +197,19 @@ class Evaluator(object):

          return review_res

+     def _generate_review_id(self, answer_d):
+         # Gen review_id (concat: answer_id + reviewer_spec)
+         answer_id = answer_d[AnswerKeys.ANSWER_ID]
+         reviewer_spec = {
+             'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+             'reviewer': ['Evaluator'],
+             'revision': ['default']
+         }
+         reviewer_spec_str = json.dumps(
+             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+         return review_id, reviewer_spec
+
      def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
          """
          Get reviews from answers.
@@ -263,19 +233,7 @@ class Evaluator(object):
              logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

          for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-             # Gen review_id (concat: answer_id + reviewer_spec)
-             answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-             reviewer_spec: dict = {
-                 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                 'reviewer': ['Evaluator'],
-                 'revision': ['default']
-             }
-             reviewer_spec_str = json.dumps(
-                 OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-             review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+             review_id, reviewer_spec = self._generate_review_id(answer_d)
              # Get review
              review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

@@ -283,7 +241,6 @@ class Evaluator(object):
                  logger.info(review_d)

              reviews_list.append(review_d)
-
              # Dump reviews
              dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

@@ -379,7 +336,8 @@ class Evaluator(object):
          stage_answers_dict = {}
          stage_reviews_dict = {}

-         for subset_name, prompts_list in self.prompts.items():
+         prompts = self.load_dataset()
+         for subset_name, prompts_list in prompts.items():
              limit = kwargs.get('limit', len(prompts_list))
              prompts_list = prompts_list[:limit]
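
The answer-id logic that was previously duplicated in both branches of get_answers is now a single helper. As a standalone sketch of the scheme (make_answer_id is just an illustrative name; it mirrors _generate_answer_id above):

    import json
    from collections import OrderedDict

    from evalscope.utils import dict_torch_dtype_to_str, gen_hash

    def make_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
        # Serialize each dict with sorted keys so identical configs always hash to
        # the same id, which is what lets cached predictions be reused across runs.
        parts = [
            json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(d).items())), ensure_ascii=False)
            for d in (model_cfg, input_d, infer_cfg)
        ]
        return 'answer-' + gen_hash(''.join(parts))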
 
evalscope/evaluator/reviewer/auto_reviewer.py CHANGED
@@ -8,10 +8,10 @@ import sys
  import time
  from abc import ABC, abstractmethod
  from functools import partial
- from typing import Any, List
+ from typing import Any, List, Tuple

  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
- from evalscope.models.openai_model import OpenAIModel
+ from evalscope.models.model import OpenAIModel
  from evalscope.utils import completion_parsers, random_seeded_choice
  from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
  from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
                                review_text=review_text)
          return review_result

-     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
+     def _get_review_pair(self,
+                          model_a,
+                          model_b,
+                          question,
+                          category,
+                          ans1,
+                          ans2,
+                          dry_run=False,
+                          **kwargs) -> Tuple[str, Any]:
          input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)

          if self.reference_list:
@@ -263,7 +271,7 @@ class AutoReviewerGpt4(BaseReviewer):
              result = (result, None)
          return review_text, *result

-     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any):
+     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
          input_msg = dict(ques=question, category=category, ans1=answer)

          if self.reference_list:
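
For context on the signature changes above: a bare tuple such as (str, Any) evaluates to a tuple object at runtime and is rejected by type checkers as a return annotation, which is why the methods now use typing.Tuple. A minimal illustration (the function is hypothetical):

    from typing import Any, Tuple

    def get_review(answer: str) -> Tuple[str, Any]:  # `-> (str, Any)` would not type-check
        return answer, None
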
evalscope/metrics/__init__.py CHANGED
@@ -1 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+ from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+ WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
+ WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
+ Pass1 = {'name': 'Pass@1', 'object': weighted_mean}
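
The metric descriptors exported here are plain dicts pairing a display name with an aggregation callable; data adapters list them in metric_list, and the Evaluator reads metric_d['name'] when building review ids (see the _generate_review_id hunk above). A small sketch:

    from evalscope.metrics import Pass1, WeightedAverageAccuracy

    metric_list = [WeightedAverageAccuracy, Pass1]
    print([m['name'] for m in metric_list])  # ['WeightedAverageAccuracy', 'Pass@1']

    aggregate = WeightedAverageAccuracy['object']  # the weighted_mean function imported above
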
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -55,7 +55,7 @@ try:
          os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
          os.system(f'unzip {punkt_path} -d {nltk_dir}')
      else:
-         logger.info(f'{punkt_path} already exists, skipping download')
+         logger.debug(f'{punkt_path} already exists, skipping download')
  except Exception as e:
      logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')