evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff shows the content changes between two package versions publicly released to one of the supported registries, and is provided for informational purposes only.

Potentially problematic release.



Files changed (57)
  1. evalscope/backend/rag_eval/__init__.py +1 -1
  2. evalscope/backend/rag_eval/backend_manager.py +21 -5
  3. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  4. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  5. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  6. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  7. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  8. evalscope/backend/rag_eval/utils/llm.py +4 -4
  9. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  10. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  11. evalscope/benchmarks/data_adapter.py +6 -2
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  13. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  14. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  15. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  16. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  17. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  18. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  19. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  20. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  21. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  22. evalscope/collections/evaluator.py +4 -2
  23. evalscope/config.py +1 -1
  24. evalscope/perf/arguments.py +24 -5
  25. evalscope/perf/benchmark.py +28 -42
  26. evalscope/perf/http_client.py +2 -3
  27. evalscope/perf/plugin/api/custom_api.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +2 -2
  29. evalscope/perf/plugin/datasets/custom.py +4 -1
  30. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  31. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  32. evalscope/perf/plugin/datasets/openqa.py +4 -1
  33. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  34. evalscope/perf/utils/benchmark_util.py +12 -6
  35. evalscope/perf/utils/db_util.py +1 -1
  36. evalscope/perf/utils/log_utils.py +41 -0
  37. evalscope/report/app.py +11 -11
  38. evalscope/run.py +7 -0
  39. evalscope/summarizer.py +2 -1
  40. evalscope/utils/utils.py +36 -25
  41. evalscope/version.py +2 -2
  42. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
  43. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
  44. tests/cli/test_all.py +4 -4
  45. tests/cli/test_collection.py +2 -1
  46. tests/cli/test_run.py +9 -8
  47. tests/perf/test_perf.py +1 -2
  48. tests/rag/test_clip_benchmark.py +0 -1
  49. tests/rag/test_mteb.py +37 -8
  50. tests/rag/test_ragas.py +29 -26
  51. tests/vlm/test_vlmeval.py +37 -1
  52. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  53. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  54. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  55. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  56. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  57. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0

evalscope/backend/rag_eval/__init__.py
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
 from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM

evalscope/backend/rag_eval/backend_manager.py
@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()
 
 
+class Tools:
+    MTEB = 'mteb'
+    RAGAS = 'ragas'
+    CLIP_BENCHMARK = 'clip_benchmark'
+
+
 class RAGEvalBackendManager(BackendManager):
 
     def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
 
         if testset_args is not None:
-            generate_testset(TestsetGenerationArguments(**testset_args))
+            if isinstance(testset_args, dict):
+                generate_testset(TestsetGenerationArguments(**testset_args))
+            elif isinstance(testset_args, TestsetGenerationArguments):
+                generate_testset(testset_args)
+            else:
+                raise ValueError('Please provide the testset generation arguments.')
         if eval_args is not None:
-            rag_eval(EvaluationArguments(**eval_args))
+            if isinstance(eval_args, dict):
+                rag_eval(EvaluationArguments(**eval_args))
+            elif isinstance(eval_args, EvaluationArguments):
+                rag_eval(eval_args)
+            else:
+                raise ValueError('Please provide the evaluation arguments.')
 
     @staticmethod
     def run_clip_benchmark(args):
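
Illustrative note: run_ragas now accepts either a plain dict or an already-constructed dataclass for each argument group. A minimal sketch of both forms, assuming TestsetGenerationArguments is importable from evalscope.backend.rag_eval.ragas.arguments (path inferred from the files-changed list):

# Both forms are routed to generate_testset(); a dict is converted to the
# dataclass internally, an existing instance is passed through unchanged.
from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments  # import path assumed

as_dict = {'generator_llm': {}, 'embeddings': {}}  # converted via TestsetGenerationArguments(**as_dict)
as_args = TestsetGenerationArguments(generator_llm={}, embeddings={})  # used as-is
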
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
 
     def run(self, *args, **kwargs):
         tool = self.config_d.pop('tool')
-        if tool.lower() == 'mteb':
+        if tool.lower() == Tools.MTEB:
             self._check_env('mteb')
             model_args = self.config_d['model']
             eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() == 'ragas':
+        elif tool.lower() == Tools.RAGAS:
             self._check_env('ragas')
             testset_args = self.config_d.get('testset_generation', None)
             eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() == 'clip_benchmark':
+        elif tool.lower() == Tools.CLIP_BENCHMARK:
             self._check_env('webdataset')
             self.run_clip_benchmark(self.config_d['eval'])
         else:
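
Illustrative note: a hedged sketch of the config shape that run() consumes. Only the 'tool', 'model', and 'eval' keys come from the hunk above; the nested values are placeholders and may not match the real CMTEB schema.

# Sketch of a RAGEval config dict as dispatched by RAGEvalBackendManager.run().
eval_config = {
    'tool': 'MTEB',  # compared case-insensitively against Tools.MTEB
    'model': [{'model_name_or_path': 'AI-ModelScope/m3e-base'}],  # placeholder model spec
    'eval': {'tasks': ['TNews']},                                 # placeholder eval spec
}
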

evalscope/backend/rag_eval/cmteb/arguments.py
@@ -20,6 +20,12 @@ class ModelArguments:
     encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
     hub: str = 'modelscope'  # modelscope or huggingface
 
+    # for API embedding model
+    model_name: Optional[str] = None
+    api_base: Optional[str] = None
+    api_key: Optional[str] = None
+    dimensions: Optional[int] = None
+
     def to_dict(self) -> Dict[str, Any]:
         return {
             'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
             'hub': self.hub,
+            'model_name': self.model_name,
+            'api_base': self.api_base,
+            'api_key': self.api_key,
+            'dimensions': self.dimensions,
         }
 
 
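Illustrative note: a hedged example of the new API-embedding fields, shown in dict form because constructing ModelArguments directly may require fields not visible in this hunk. The endpoint, key, and model id are placeholders.

# When model_name/api_base/api_key are set, the CMTEB model config points at an
# OpenAI-compatible embedding endpoint instead of a local checkpoint.
api_embedding_cfg = {
    'model_name': 'text-embedding-v3',     # placeholder model id
    'api_base': 'https://example.com/v1',  # placeholder OpenAI-compatible endpoint
    'api_key': 'EMPTY',
    'dimensions': 1024,
}
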
evalscope/backend/rag_eval/ragas/arguments.py
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     #  'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',

evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
 
 
 def load_data(file_path):
-    from langchain_community.document_loaders import UnstructuredFileLoader
+    import nltk
+    from langchain_unstructured import UnstructuredLoader
 
-    loader = UnstructuredFileLoader(file_path, mode='single')
+    if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+        # need to download nltk data for the first time
+        nltk.download('averaged_perceptron_tagger_eng')
+
+    loader = UnstructuredLoader(file_path)
     data = loader.load()
     return data
 

evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
@@ -2,7 +2,6 @@ import asyncio
 import os
 from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin, PydanticPrompt
-from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
 from typing import List
 
 from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
     llm: BaseRagasLLM,
     adapt_instruction: bool = False,
 ):
-    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
-        return
-
     if not issubclass(type(prompt_user), PromptMixin):
         logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
         return

evalscope/backend/rag_eval/utils/embedding.py
@@ -1,10 +1,12 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
 
     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
         prompt: str = '',
-        revision: Optional[str] = None,
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
         )
 
-    def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3:  # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
         return embeddings
 
 
+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=False)
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+    def encode_queries(self, queries, **kwargs):
+        return self.encode(queries, **kwargs)
+
+    def encode_corpus(self, corpus, **kwargs):
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+        return self.encode(input_texts, **kwargs)
+
+
 class EmbeddingModel:
     """Custom embeddings"""
 
@@ -165,6 +207,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
        **kwargs,
    ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
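
Illustrative note: a minimal sketch of how the routing added here might be exercised. The factory method name on EmbeddingModel is not visible in this hunk and is assumed; the endpoint values are placeholders.

# Presence of model_name routes to APIEmbeddingModel (backed by OpenAIEmbeddings).
emb = EmbeddingModel.load(                 # 'load' is an assumed factory name
    model_name='text-embedding-v3',        # placeholder model id
    api_base='https://example.com/v1',     # placeholder endpoint
    api_key='EMPTY',
    dimensions=1024,
)
vectors = emb.encode(['query one', 'query two'])  # batched via embed_documents, returns a torch.Tensor
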

evalscope/backend/rag_eval/utils/llm.py
@@ -2,7 +2,7 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from modelscope.utils.hf_util import GenerationConfig
+from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from evalscope.constants import DEFAULT_MODEL_REVISION
@@ -16,9 +16,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-                model_name=kw.get('model_name', ''),
-                openai_api_base=api_base,
-                openai_api_key=kw.get('api_key', 'EMPTY'),
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
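
Illustrative note: the rename tracks the current langchain_openai parameter names (model, base_url, api_key) rather than the older openai_* aliases. A small standalone sketch with placeholder values:

from langchain_openai import ChatOpenAI

# Equivalent direct construction with the current parameter names;
# the model id, endpoint, and key are placeholders.
chat = ChatOpenAI(
    model='qwen-plus',                  # placeholder model name
    base_url='https://example.com/v1',  # OpenAI-compatible endpoint
    api_key='EMPTY',
)
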

evalscope/backend/vlm_eval_kit/backend_manager.py
@@ -1,4 +1,5 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['name']  # remove not used args
             del remain_cfg['type']  # remove not used args
 
-            self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
-            new_model_names.append(model_type)
+            norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+            new_model_names.append(norm_model_type)
         else:
             remain_cfg = copy.deepcopy(model_cfg)
             del remain_cfg['name']  # remove not used args
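
Illustrative note: the normalization keeps registry keys path- and identifier-friendly when model_type carries a path, tag, or version. A quick demonstration of the expression on hypothetical values (only the expression itself comes from the diff):

import os

for model_type in ['org/model:v1.5', 'qwen2.5-vl-7b']:  # hypothetical model_type values
    norm = os.path.basename(model_type).replace(':', '-').replace('.', '_')
    print(model_type, '->', norm)
# org/model:v1.5 -> model-v1_5
# qwen2.5-vl-7b  -> qwen2_5-vl-7b
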

evalscope/benchmarks/arc/arc_adapter.py
@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(text=result)
+            return ResponseParser.parse_first_option(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
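
Illustrative note: passing options constrains parsing to the dataset's actual choice labels. The snippet below is only an illustrative stand-in for that idea, not evalscope's ResponseParser implementation.

import re
from typing import List, Optional

def first_option_stub(text: str, options: List[str]) -> Optional[str]:
    """Illustrative stand-in: return the first standalone option label found in text."""
    pattern = r'\b(' + '|'.join(re.escape(o) for o in options) + r')\b'
    m = re.search(pattern, text)
    return m.group(1) if m else None

print(first_option_stub('The answer is B because ...', options=['A', 'B', 'C', 'D']))  # -> 'B'
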

evalscope/benchmarks/data_adapter.py
@@ -314,11 +314,15 @@ class DataAdapter(ABC):
             kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
-    def gen_prompt_data(self, prompt: str, system_prompt: Optional[str] = None, **kwargs) -> dict:
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        **kwargs) -> dict:
         if not isinstance(prompt, list):
             prompt = [prompt]
         prompt_data = PromptData(
-            data=prompt, multi_choices=self.choices, system_prompt=system_prompt or self.system_prompt)
+            data=prompt, multi_choices=choices or self.choices, system_prompt=system_prompt or self.system_prompt)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
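
Illustrative note: the new choices parameter lets an adapter override the dataset-level self.choices per sample (useful when items carry varying option sets). A hedged sketch of a hypothetical subclass using it; the input field names are illustrative.

from evalscope.benchmarks.data_adapter import DataAdapter  # import path assumed

class MyChoiceAdapter(DataAdapter):  # hypothetical adapter, other abstract methods omitted

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs):
        prompt = input_d['question']                 # hypothetical field name
        per_sample_choices = input_d.get('options')  # hypothetical field name
        return self.gen_prompt_data(prompt, choices=per_sample_choices)
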

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
            for subset_name in subset_list:
                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
        elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
            data_file_dict[cur_subset_name] = dataset_name_or_path
        else:
            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
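
Illustrative note: os.path.splitext keeps dots inside the file stem, which split('.')[0] would truncate. A quick comparison on a hypothetical file name:

import os

name = 'my.custom.dataset.jsonl'  # hypothetical file name with dots in the stem
print(os.path.basename(name).split('.')[0])         # 'my'                 (old behaviour, loses the stem)
print(os.path.splitext(os.path.basename(name))[0])  # 'my.custom.dataset'  (new behaviour)
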

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -18,7 +18,6 @@ logger = get_logger()
     extra_params={
         'start_date': None,
         'end_date': None,
-        'num_process_evaluate': 1,
         'timeout': 6
     },
     system_prompt=
@@ -33,7 +32,6 @@ class LiveCodeBenchAdapter(DataAdapter):
 
         extra_params = kwargs.get('extra_params', {})
 
-        self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
         self.timeout = extra_params.get('timeout', 6)
         self.start_date = extra_params.get('start_date')
         self.end_date = extra_params.get('end_date')
@@ -84,7 +82,7 @@ class LiveCodeBenchAdapter(DataAdapter):
             references,
             predictions,
             k_list=[1],
-            num_process_evaluate=self.num_process_evaluate,
+            num_process_evaluate=1,
             timeout=self.timeout,
         )
         return metrics['pass@1'] / 100  # convert to point scale