evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
-    parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
+    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     # yapf: enable

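The net effect of this one-line change: judge requests are issued serially unless the flag is raised explicitly. A minimal, self-contained sketch (parser trimmed to just the flag shown in this hunk):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')

assert parser.parse_args([]).judge_worker_num == 1  # new default: serial judging
assert parser.parse_args(['--judge-worker-num', '8']).judge_worker_num == 8  # old behavior, now opt-in
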
evalscope/backend/rag_eval/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
 from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py CHANGED
@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class Tools:
+    MTEB = 'mteb'
+    RAGAS = 'ragas'
+    CLIP_BENCHMARK = 'clip_benchmark'
+
+
 class RAGEvalBackendManager(BackendManager):
 
     def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
 
         if testset_args is not None:
-            generate_testset(TestsetGenerationArguments(**testset_args))
+            if isinstance(testset_args, dict):
+                generate_testset(TestsetGenerationArguments(**testset_args))
+            elif isinstance(testset_args, TestsetGenerationArguments):
+                generate_testset(testset_args)
+            else:
+                raise ValueError('Please provide the testset generation arguments.')
         if eval_args is not None:
-            rag_eval(EvaluationArguments(**eval_args))
+            if isinstance(eval_args, dict):
+                rag_eval(EvaluationArguments(**eval_args))
+            elif isinstance(eval_args, EvaluationArguments):
+                rag_eval(eval_args)
+            else:
+                raise ValueError('Please provide the evaluation arguments.')
 
     @staticmethod
     def run_clip_benchmark(args):
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
 
     def run(self, *args, **kwargs):
         tool = self.config_d.pop('tool')
-        if tool.lower() == 'mteb':
+        if tool.lower() == Tools.MTEB:
             self._check_env('mteb')
             model_args = self.config_d['model']
             eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() == 'ragas':
+        elif tool.lower() == Tools.RAGAS:
             self._check_env('ragas')
             testset_args = self.config_d.get('testset_generation', None)
             eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() == 'clip_benchmark':
+        elif tool.lower() == Tools.CLIP_BENCHMARK:
             self._check_env('webdataset')
             self.run_clip_benchmark(self.config_d['eval'])
         else:
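Taken together, these three hunks let run() dispatch on the new Tools constants and let run_ragas accept either dicts or ready-made argument objects. A hedged sketch of the call path (RAGEvalBackendManager and Tools are exported per the __init__.py hunk above; the model and eval payloads are illustrative placeholders):

from evalscope.backend.rag_eval import RAGEvalBackendManager, Tools

config = {
    'tool': Tools.MTEB,  # resolves to 'mteb'; replaces the bare string literal
    'model': [{'model_name_or_path': 'AI-ModelScope/m3e-base'}],  # placeholder model spec
    'eval': {'tasks': ['TNews']},  # placeholder eval spec
}
# RAGEvalBackendManager(config=config).run()  # routes to run_mteb via Tools.MTEB
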
evalscope/backend/rag_eval/cmteb/arguments.py CHANGED
@@ -20,6 +20,12 @@ class ModelArguments:
     encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
     hub: str = 'modelscope'  # modelscope or huggingface
 
+    # for API embedding model
+    model_name: Optional[str] = None
+    api_base: Optional[str] = None
+    api_key: Optional[str] = None
+    dimensions: Optional[int] = None
+
     def to_dict(self) -> Dict[str, Any]:
         return {
             'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
             'hub': self.hub,
+            'model_name': self.model_name,
+            'api_base': self.api_base,
+            'api_key': self.api_key,
+            'dimensions': self.dimensions,
         }
 
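A short sketch of populating the new API-mode fields (field names are from the two hunks above; the endpoint and model values are placeholders, and the import path follows this file's location in the diff):

from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments  # path per this diff

args = ModelArguments(
    model_name='text-embedding-v3',  # setting model_name opts into the API embedding path
    api_base='https://example.com/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
    dimensions=1024,
)
print(args.to_dict()['dimensions'])  # the new keys round-trip through to_dict()
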
evalscope/backend/rag_eval/ragas/arguments.py CHANGED
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
    embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     #             'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py CHANGED
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
 
 
 def load_data(file_path):
-    from langchain_community.document_loaders import UnstructuredFileLoader
+    import nltk
+    from langchain_unstructured import UnstructuredLoader
 
-    loader = UnstructuredFileLoader(file_path, mode='single')
+    if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+        # need to download nltk data for the first time
+        nltk.download('averaged_perceptron_tagger_eng')
+
+    loader = UnstructuredLoader(file_path)
     data = loader.load()
     return data
 
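One caveat for anyone reproducing this hunk: nltk.data.find() raises LookupError when a resource is missing rather than returning False, so the `is False` guard never fires. The conventional idiom, sketched here with the same resource name as the diff:

import nltk

try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    # first run only: fetch the tagger data that unstructured's partitioning needs
    nltk.download('averaged_perceptron_tagger_eng')
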
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py CHANGED
@@ -2,7 +2,6 @@ import asyncio
 import os
 from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin, PydanticPrompt
-from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
 from typing import List
 
 from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
     llm: BaseRagasLLM,
     adapt_instruction: bool = False,
 ):
-    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
-        return
-
     if not issubclass(type(prompt_user), PromptMixin):
         logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
         return
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -1,10 +1,12 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
 
     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
         prompt: str = '',
-        revision: Optional[str] = None,
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
         )
 
-    def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
         return embeddings
 
 
+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=False)
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+    def encode_queries(self, queries, **kwargs):
+        return self.encode(queries, **kwargs)
+
+    def encode_corpus(self, corpus, **kwargs):
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+        return self.encode(input_texts, **kwargs)
+
+
 class EmbeddingModel:
     """Custom embeddings"""
 
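A hedged usage sketch of the new class (constructor kwargs as in the hunk; the model and endpoint values are placeholders, and the shape comment assumes the server honors `dimensions`):

from evalscope.backend.rag_eval.utils.embedding import APIEmbeddingModel  # path per this diff

model = APIEmbeddingModel(
    model_name='text-embedding-v3',  # placeholder API model
    api_base='https://example.com/v1',  # placeholder endpoint
    api_key='EMPTY',
    dimensions=1024,
)
vectors = model.encode(['first query', 'second query'])  # batches through embed_documents
print(vectors.shape)  # torch.Size([2, 1024]), assuming the endpoint returns 1024-dim vectors
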
@@ -165,6 +207,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
         **kwargs,
     ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -2,11 +2,11 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from modelscope.utils.hf_util import GenerationConfig
+from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models import ChatGenerationModelAdapter
+from evalscope.models import ChatGenerationModelAdapter, LocalModel
 
 
 class LLM:
@@ -16,9 +16,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-                model_name=kw.get('model_name', ''),
-                openai_api_base=api_base,
-                openai_api_key=kw.get('api_key', 'EMPTY'),
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
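This hunk swaps the legacy openai_* keyword aliases for the current langchain_openai parameter names; standalone, the new call shape looks like this (endpoint and model are placeholders):

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model='qwen2.5-7b-instruct',  # placeholder model name
    base_url='http://127.0.0.1:8000/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
)
# llm.invoke('Hello')  # standard Runnable entry point once the endpoint is live
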
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
         self.model = ChatGenerationModelAdapter(
-            model_id=self.model_name_or_path,
-            model_revision=self.model_revision,
+            model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
         )
 
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
         """Run the LLM on the given input."""
         infer_cfg = {'stop': stop}
 
-        response = self.model._model_generate(prompt, infer_cfg)
-        return response
+        response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+        return response[0][0]
 
     @property
     def _identifying_params(self) -> Dict[str, Any]:
evalscope/backend/vlm_eval_kit/backend_manager.py CHANGED
@@ -1,4 +1,5 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['name']  # remove not used args
             del remain_cfg['type']  # remove not used args
 
-            self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
-            new_model_names.append(model_type)
+            norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+            new_model_names.append(norm_model_type)
         else:
             remain_cfg = copy.deepcopy(model_cfg)
             del remain_cfg['name']  # remove not used args
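The normalization strips path separators, colons, and dots so the registered key is safe to use as a model name and filename; a quick trace of the exact expression from the hunk:

import os

model_type = 'OpenGVLab/InternVL2.5-8B:latest'  # illustrative model id
norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
print(norm_model_type)  # InternVL2_5-8B-latest
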
evalscope/benchmarks/alpaca_eval/__init__.py ADDED
File without changes
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py ADDED
@@ -0,0 +1,109 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
+
+GRADER_TEMPLATE = """
+I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+
+## Instruction
+
+{{
+    "instruction": "{instruction}"
+}}
+
+## Model Outputs
+
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+
+{{
+    {{
+        "model_identifier": "m",
+        "output": "{output_1}"
+    }},
+    {{
+        "model_identifier": "M",
+        "output": "{output_2}"
+    }}
+}}
+
+## Task
+
+Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
+
+## Best Model Identifier
+""".strip()  # noqa: E501
+
+
+@Benchmark.register(
+    name='alpaca_eval',
+    pretty_name='AlpacaEval2.0',
+    dataset_id='AI-ModelScope/alpaca_eval',
+    subset_list=['alpaca_eval_gpt4_baseline'],
+    metric_list=['winrate'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='eval')
+class AlpacaEvalAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # register metrics
+        metric_registry.register(Metric(name='winrate', object=mean))
+
+        # whether to use LLM as a judge
+        self.llm_as_a_judge = True
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['instruction']
+        return self.gen_prompt_data(question)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['output']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str):
+        # simple match
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
+        return None
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
+        raw_input = kwargs.get('raw_input', None)
+        instruction = raw_input['instruction']
+        # gold is baseline answer 'm', pred is model answer 'M'
+        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
+        # get grading response
+        grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
+        # parse grading response
+        match = re.search(r'(m|M)', grading_response)
+        res = match.group(0) if match else None
+        if res:
+            return res == 'M'
+        else:
+            logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
+            return None
+
+    def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
+        """
+        # zip dict answers
+        res_list = [res for res in review_res_list if res is not None]
+
+        return super().compute_metric(res_list, **kwargs)
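For orientation, a hedged sketch of driving the new benchmark end to end (TaskConfig and run_task follow evalscope's usual entry points; the model id is a placeholder, and the judge itself is configured via the judge-* options from arguments.py):

from evalscope import TaskConfig, run_task  # top-level exports in recent releases

task = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model under test
    datasets=['alpaca_eval'],  # name registered by this adapter
)
# run_task(task)  # winrate is the mean of the judge's m/M verdicts from llm_match()
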
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(text=result)
+            return ResponseParser.parse_first_option(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
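The added options argument lets the parser anchor on the benchmark's actual choice letters instead of any capital letter in the response; an illustrative call (behavior inferred from the name and the signature shown in this hunk, import path assumed):

from evalscope.utils import ResponseParser  # import path assumed

pred = ResponseParser.parse_first_option(text='The answer is (B): cooling.', options=['A', 'B', 'C', 'D'])
print(pred)  # expected: 'B'
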
evalscope/benchmarks/arena_hard/__init__.py ADDED
File without changes
evalscope/benchmarks/arena_hard/arena_hard_adapter.py ADDED
@@ -0,0 +1,120 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."  # noqa: E501
+
+GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
+)  # noqa: E501
+
+
+@Benchmark.register(
+    name='arena_hard',
+    pretty_name='ArenaHard',
+    dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
+    metric_list=['winrate'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test')
+class AlpacaEvalAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # register metrics
+        metric_registry.register(Metric(name='winrate', object=mean))
+
+        # whether to use LLM as a judge
+        self.llm_as_a_judge = True
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        return self.gen_prompt_data(question)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['prediction']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str):
+        # simple match
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
+        return None
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        from .utils import post_process_arenahard
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # gold is baseline answer 'A', pred is model answer 'B'
+        prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=gold, answer_2=pred)
+        # reverse the order
+        prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=pred, answer_2=gold)
+        # get grading response
+        game1_response = judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
+        game2_response = judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
+        # parse grading response
+        res1 = post_process_arenahard(game1_response)
+        res2 = post_process_arenahard(game2_response)
+        return {
+            'model_a':
+            'gpt4-0314',
+            'model_b':
+            'test_model',
+            'games': [
+                {
+                    'user_prompt': prompt1,
+                    'judgment': game1_response,
+                    'score': res1
+                },
+                {
+                    'user_prompt': prompt2,
+                    'judgment': game2_response,
+                    'score': res2
+                },
+            ]
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        """
+        compute score of the model
+        """
+        import pandas as pd
+
+        from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
+
+        if isinstance(review_res_list[0], list):
+            review_res_list = [item for sublist in review_res_list for item in sublist]
+
+        battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+
+        bootstrap_online_elo = compute_mle_elo(battles)
+
+        # bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
+        stats = pd.DataFrame()
+        stats['results'] = None
+        stats['results'] = stats['results'].astype('object')
+
+        for i, model in enumerate(bootstrap_online_elo.index):
+            # assert model in bootstrap_elo_lu.columns
+            stats.at[i, 'model'] = model
+            stats.at[i, 'score'] = bootstrap_online_elo[model]
+            # stats.at[i, "lower"] = np.percentile(bootstrap_elo_lu[model], 2.5)
+            # stats.at[i, "upper"] = np.percentile(bootstrap_elo_lu[model], 97.5)
+
+        # stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
+
+        score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']
+
+        return [{'metric_name': 'winrate', 'score': score, 'num': len(review_res_list)}]
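The adapter leans on post_process_arenahard (defined in the new arena_hard/utils.py, +162 lines, not expanded in this diff) to pull the [[A>B]]-style verdict out of each judgment; a minimal stand-in sketch of that parsing step, not the shipped helper:

import re

def parse_verdict(judgment: str):
    """Extract the final [[...]] verdict label, e.g. 'A=B' or 'B>>A'."""
    matches = re.findall(r'\[\[([AB][<>=]{1,2}[AB])\]\]', judgment)
    return matches[-1] if matches else None

print(parse_verdict('My final verdict is tie: [[A=B]]'))  # A=B
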