evalscope 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (48)
  1. evalscope/backend/__init__.py +0 -3
  2. evalscope/backend/opencompass/backend_manager.py +2 -0
  3. evalscope/backend/opencompass/tasks/eval_datasets.py +2 -2
  4. evalscope/backend/rag_eval/__init__.py +3 -0
  5. evalscope/backend/rag_eval/backend_manager.py +68 -0
  6. evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
  7. evalscope/backend/rag_eval/cmteb/arguments.py +59 -0
  8. evalscope/backend/rag_eval/cmteb/base.py +89 -0
  9. evalscope/backend/rag_eval/cmteb/task_template.py +83 -0
  10. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
  11. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
  12. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
  13. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +153 -0
  14. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
  15. evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
  16. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +64 -0
  17. evalscope/backend/rag_eval/ragas/__init__.py +2 -0
  18. evalscope/backend/rag_eval/ragas/arguments.py +37 -0
  19. evalscope/backend/rag_eval/ragas/task_template.py +117 -0
  20. evalscope/backend/vlm_eval_kit/backend_manager.py +1 -2
  21. evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
  22. evalscope/benchmarks/benchmark.py +1 -1
  23. evalscope/evaluator/evaluator.py +4 -3
  24. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
  25. evalscope/models/api/__init__.py +3 -0
  26. evalscope/models/api/openai_api.py +228 -0
  27. evalscope/perf/http_client.py +5 -5
  28. evalscope/run.py +4 -0
  29. evalscope/third_party/longbench_write/__init__.py +3 -0
  30. evalscope/third_party/longbench_write/eval.py +284 -0
  31. evalscope/third_party/longbench_write/infer.py +217 -0
  32. evalscope/third_party/longbench_write/longbench_write.py +88 -0
  33. evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  34. evalscope/third_party/longbench_write/resources/judge.txt +31 -0
  35. evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
  36. evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
  37. evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
  38. evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  39. evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
  40. evalscope/third_party/longbench_write/utils.py +37 -0
  41. evalscope/utils/logger.py +44 -14
  42. evalscope/utils/task_utils.py +3 -0
  43. evalscope/version.py +2 -2
  44. {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/METADATA +46 -60
  45. {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/RECORD +48 -18
  46. {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/WHEEL +0 -0
  47. {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/__init__.py ADDED
@@ -0,0 +1,64 @@
+ from .Classification import *
+ from .Clustering import *
+ from .PairClassification import *
+ from .Reranking import *
+ from .Retrieval import *
+ from .STS import *
+
+ CLS_CLASSIFICATION = {
+     "TNews": TNews,
+     "IFlyTek": IFlyTek,
+     "MultilingualSentiment": MultilingualSentiment,
+     "JDReview": JDReview,
+     "OnlineShopping": OnlineShopping,
+     "Waimai": Waimai,
+ }
+
+ CLS_CLUSTERING = {
+     "CLSClusteringS2S": CLSClusteringFastS2S,
+     "CLSClusteringP2P": CLSClusteringFastP2P,
+     "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
+     "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+ }
+
+ CLS_PAIR_CLASSIFICATION = {
+     "Ocnli": Ocnli,
+     "Cmnli": Cmnli,
+ }
+
+ CLS_RERANKING = {
+     "T2Reranking": T2Reranking,
+     "MMarcoReranking": MMarcoReranking,
+     "CMedQAv1": CMedQAv1,
+     "CMedQAv2": CMedQAv2,
+ }
+
+ CLS_RETRIEVAL = {
+     "T2Retrieval": T2Retrieval,
+     "MMarcoRetrieval": MMarcoRetrieval,
+     "DuRetrieval": DuRetrieval,
+     "CovidRetrieval": CovidRetrieval,
+     "CmedqaRetrieval": CmedqaRetrieval,
+     "EcomRetrieval": EcomRetrieval,
+     "MedicalRetrieval": MedicalRetrieval,
+     "VideoRetrieval": VideoRetrieval,
+ }
+
+ CLS_STS = {
+     "ATEC": ATEC,
+     "BQ": BQ,
+     "LCQMC": LCQMC,
+     "PAWSX": PAWSX,
+     "STSB": STSB,
+     "AFQMC": AFQMC,
+     "QBQTC": QBQTC,
+ }
+
+ CLS_DICT = {
+     **CLS_CLASSIFICATION,
+     **CLS_CLUSTERING,
+     **CLS_PAIR_CLASSIFICATION,
+     **CLS_RERANKING,
+     **CLS_RETRIEVAL,
+     **CLS_STS,
+ }
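
The merged CLS_DICT gives a single name-to-task-class lookup across all six CMTEB categories. A minimal sketch of how such a registry is typically consumed (illustrative only, not part of the diff):

    from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT

    # Resolve task classes from user-supplied dataset names.
    selected = ["TNews", "T2Retrieval"]
    task_classes = [CLS_DICT[name] for name in selected]
    print([cls.__name__ for cls in task_classes])   # ['TNews', 'T2Retrieval']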
evalscope/backend/rag_eval/ragas/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments, EvaluationArguments
+ from evalscope.backend.rag_eval.ragas.task_template import testset_generation, rag_eval
evalscope/backend/rag_eval/ragas/arguments.py ADDED
@@ -0,0 +1,37 @@
+ from dataclasses import dataclass, field
+ from typing import List, Optional, Union, Dict, Any
+
+
+ @dataclass
+ class TestsetGenerationArguments:
+     docs: List[str] = field(default_factory=list)
+     test_size: int = 10
+     output_file: str = "outputs/testset.json"
+     """
+     For local LLM support, you can use the following fields:
+         model_name_or_path: str
+         model_revision: str = "master"
+         template_type: str = "default"
+         generation_config: Optional[Dict]
+
+     For API LLM support, you can use the following fields:
+         model_name="gpt-4o-mini"
+         api_base: str = "",
+         api_key: Optional[str] = None
+     """
+     generator_llm: Dict = field(default_factory=dict)
+     critic_llm: Dict = field(default_factory=dict)
+     embeddings: Dict = field(default_factory=dict)
+     distribution: str = field(
+         default_factory=lambda: {"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1}
+     )
+
+
+ @dataclass
+ class EvaluationArguments:
+     testset_file: str
+     critic_llm: Dict = field(default_factory=dict)
+     embeddings: Dict = field(default_factory=dict)
+     metrics: List[str] = field(
+         default_factory=lambda: ["answer_relevancy", "faithfulness"]
+     )
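
For orientation, a sketch of filling in TestsetGenerationArguments; the LLM dict keys follow the docstring above, while the embeddings keys are placeholders that this diff does not specify:

    from evalscope.backend.rag_eval.ragas import TestsetGenerationArguments

    gen_args = TestsetGenerationArguments(
        docs=["data/knowledge_base.md"],            # example source document
        test_size=5,
        output_file="outputs/testset.json",
        generator_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
        critic_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
        embeddings={"model_name_or_path": "<embedding-model-id>"},   # placeholder keys/values
    )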
evalscope/backend/rag_eval/ragas/task_template.py ADDED
@@ -0,0 +1,117 @@
+ import os
+ from evalscope.backend.rag_eval import EmbeddingModel, LLM
+ from evalscope.utils.logger import get_logger
+ from .arguments import TestsetGenerationArguments, EvaluationArguments
+
+ logger = get_logger()
+
+
+ def testset_generation(args: TestsetGenerationArguments) -> None:
+     from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+     from ragas.testset.generator import TestsetGenerator
+     from ragas.testset.evolutions import simple, reasoning, multi_context
+     from ragas import RunConfig
+
+     # load data
+     file_path = args.docs
+     loader = UnstructuredFileLoader(file_path, mode="elements")
+     data = loader.load()
+
+     # generator with models
+     generator_llm = LLM.load(**args.generator_llm)
+     critic_llm = LLM.load(**args.critic_llm)
+     embeddings = EmbeddingModel.load(**args.embeddings)
+
+     # Change resulting question type distribution
+     distributions = {
+         simple: args.distribution["simple"],
+         multi_context: args.distribution["multi_context"],
+         reasoning: args.distribution["reasoning"],
+     }
+
+     generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)
+
+     runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+     testset = generator.generate_with_langchain_docs(
+         data,
+         args.test_size,
+         distributions,
+         with_debugging_logs=True,
+         is_async=False,
+         run_config=runconfig,
+     )
+
+     # save file
+     testset_df = testset.to_pandas()
+     output_path = os.path.dirname(args.output_file)
+     os.makedirs(output_path, exist_ok=True)
+     testset_df.to_json(args.output_file, indent=4, index=False, orient="records")
+
+     # get answer
+     testset_with_answer = get_answer(testset_df, generator_llm)
+     testset_with_answer.to_json(
+         args.output_file, indent=4, index=False, orient="records"
+     )
+
+
+ def get_answer(testset_df, generator_llm):
+     template = """You are an assistant for question-answering tasks.
+     Use the following pieces of retrieved context to answer the question.
+     If you don't know the answer, just say that you don't know.
+     Use two sentences maximum and keep the answer concise.
+     Question: {question}
+     Context: {contexts}
+     Answer:
+     """
+     answers = []
+     for index, row in testset_df.iterrows():
+         question = row["question"]
+         contexts = "\n".join(row["contexts"])
+
+         # Combine question and contexts as input for the LLM
+         input_text = template.format(question=question, contexts=contexts)
+
+         # Generate the answer using the generator LLM
+         answer = generator_llm.invoke(input_text)
+         answers.append(answer)
+
+     testset_df["answer"] = answers
+     return testset_df
+
+
+ def rag_eval(
+     args: EvaluationArguments,
+ ) -> None:
+     from datasets import Dataset
+     from ragas import evaluate
+     from evalscope.backend.rag_eval import EmbeddingModel, LLM
+     from ragas import RunConfig
+     import importlib
+
+     def dynamic_import(module_name, *function_names):
+         # dynamically import the specified module
+         module = importlib.import_module(module_name)
+
+         functions = [getattr(module, name) for name in function_names]
+         return functions
+
+     llm = LLM.load(**args.critic_llm)
+     embedding = EmbeddingModel.load(**args.embeddings)
+
+     dataset = Dataset.from_json(args.testset_file)
+
+     runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+     score = evaluate(
+         dataset,
+         metrics=dynamic_import("ragas.metrics", *args.metrics),
+         llm=llm,
+         embeddings=embedding,
+         run_config=runconfig,
+     )
+     score_df = score.to_pandas()
+     # logger.info(score_df.to_string())
+
+     output_path = args.testset_file.split(".")[0] + "_score.json"
+     score_df.to_json(output_path, indent=4, index=False, orient="records")
+
+     logger.info(f"Eval score saved to {output_path}")
evalscope/backend/vlm_eval_kit/backend_manager.py CHANGED
@@ -1,10 +1,9 @@
  from typing import Optional, Union
- from evalscope.utils import is_module_installed, get_module_path, get_valid_list, yaml_to_dict, json_to_dict
+ from evalscope.utils import is_module_installed, get_valid_list
  from evalscope.backend.base import BackendManager
  from evalscope.utils.logger import get_logger
  from functools import partial
  import subprocess
- from dataclasses import dataclass
  import copy

  logger = get_logger()
evalscope/backend/vlm_eval_kit/custom_dataset.py CHANGED
@@ -8,7 +8,7 @@ class CustomDataset:

      def load_data(self, dataset):
          # customize the loading of the dataset
-         data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+         data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
          return load(data_path)

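The one-line fix matters because os.path.join never expands "~"; only os.path.expanduser does. A quick illustration (the dataset name is made up):

    import os

    os.path.join("~/LMUData", "MMBench.tsv")
    # -> '~/LMUData/MMBench.tsv'  (literal tilde, which open() cannot resolve)

    os.path.join(os.path.expanduser("~/LMUData"), "MMBench.tsv")
    # -> '/home/<user>/LMUData/MMBench.tsv', for example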
evalscope/benchmarks/benchmark.py CHANGED
@@ -46,7 +46,7 @@ class Benchmark(object):

              dataset.dataset_name = dataset_name.split('/')[-1]
              dataset.subset_name = subset
-             dataset.split = split
+             # dataset.split = split
              return dataset
          elif hub == 'HuggingFace':
              # TODO: implement this by xingjun.wxj@alibaba-inc.com
evalscope/evaluator/evaluator.py CHANGED
@@ -174,6 +174,7 @@ class Evaluator(object):
          """
          assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
          assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
+         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'

          answers_list = []
          pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
@@ -244,8 +245,8 @@ class Evaluator(object):
              answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt

              if debug:
-                 logger.debug(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                 logger.debug(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+                 logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+                 logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

              answers_list.append(answer_d)

@@ -349,7 +350,7 @@ class Evaluator(object):
              review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

              if debug:
-                 logger.debug(review_d)
+                 logger.info(review_d)

              reviews_list.append(review_d)

evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -31,6 +31,7 @@ In these examples settings.xml lists input files and formats.
  from __future__ import absolute_import, division, print_function
  import collections
  import re
+ import os

  import nltk
  import numpy as np
@@ -38,6 +39,24 @@ import six
  from absl import logging
  from rouge_score import scoring, tokenizers
  from six.moves import map, range
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
+ try:
+     nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+     os.makedirs(nltk_dir, exist_ok=True)
+     punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+     punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+     if not os.path.exists(punkt_path):
+         os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+         os.system(f'unzip {punkt_path} -d {nltk_dir}')
+     else:
+         logger.info(f'{punkt_path} already exists, skipping download')
+ except Exception as e:
+     logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')


  class RougeScorer(scoring.BaseScorer):
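
The new bootstrap shells out to wget and unzip, so it silently does nothing on machines without those tools (the except branch only logs). A pure-Python equivalent, shown here only as a sketch and not what the package ships, could look like:

    import os
    import urllib.request
    import zipfile

    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data', 'tokenizers')
    os.makedirs(nltk_dir, exist_ok=True)
    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'

    if not os.path.exists(punkt_path):
        urllib.request.urlretrieve(punkt_tab_url, punkt_path)   # download the archive
        with zipfile.ZipFile(punkt_path) as zf:
            zf.extractall(nltk_dir)                             # same effect as `unzip -d nltk_dir`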
evalscope/models/api/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.models.api.openai_api import OpenaiApi
evalscope/models/api/openai_api.py ADDED
@@ -0,0 +1,228 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import json
+ import threading
+ import time
+ from asyncio import Queue
+
+ import requests
+ from typing import Union, List, Optional, Dict
+ from concurrent.futures import ThreadPoolExecutor
+ from modelscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class OpenaiApi:
+
+     def __init__(self,
+                  model: str,
+                  openai_api_key,
+                  openai_api_base,
+                  logprobs: Optional[bool] = False,
+                  top_logprobs: Optional[int] = None,
+                  max_new_tokens: int = 4096,
+                  temperature: Optional[float] = 0.0,
+                  repetition_penalty: Optional[float] = 1.0,
+                  is_chat: bool = True,
+                  verbose: bool = True,
+                  retry: int = 3,
+                  query_per_second: int = 10,  # TODO
+                  **kwargs):
+
+         self.temperature = temperature
+         self.repetition_penalty = repetition_penalty
+         self.max_tokens = max_new_tokens
+         self.logprobs = logprobs
+         self.top_logprobs = top_logprobs
+
+         self.openai_api_key = openai_api_key
+         self.url = openai_api_base
+         self.model = model
+         self.is_chat = is_chat
+         self.retry = retry
+         self.verbose = verbose
+
+         self.token_bucket = TokenBucket(query_per_second, verbose)
+
+     def generate_simple(self, inputs: Union[List[str]]):
+
+         def process_one(in_data: str):
+
+             if self.is_chat:
+                 data = dict(
+                     model=self.model,
+                     messages=[{'role': 'user', 'content': in_data}],
+                     max_tokens=self.max_tokens,
+                     n=1,
+                     logprobs=self.logprobs,
+                     top_logprobs=self.top_logprobs,
+                     stop=None,
+                     temperature=self.temperature,
+                     repetition_penalty=self.repetition_penalty,
+                 )
+             else:
+                 data = dict(
+                     model=self.model,
+                     prompt=in_data,
+                     max_tokens=self.max_tokens,
+                     temperature=self.temperature,
+                     repetition_penalty=self.repetition_penalty,
+                 )
+
+             # todo
+             openai_api_key = self.openai_api_key or ''
+             header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+             data = json.dumps(data, ensure_ascii=False)
+
+             if self.verbose:
+                 print(f'>>data in generate_simple: {data}')
+
+             resp = requests.post(self.url, headers=header, data=data)
+             resp = resp.json()
+             if self.verbose:
+                 print(f'>>resp in generate_simple: {resp}')
+
+             if self.logprobs:
+                 return resp['choices']
+             else:
+                 if self.is_chat:
+                     return resp['choices'][0]['message']['content'].strip()
+                 else:
+                     return resp['choices'][0]['text'].strip()
+
+         with ThreadPoolExecutor() as executor:
+             results = list(executor.map(process_one, inputs))
+
+         return results
+
+     def generate(self,
+                  inputs: Union[List[str], List[List]],
+                  **kwargs) -> List[str]:
+         """
+         Generate responses from OpenAI API.
+
+         Args:
+             inputs: The input messages for the model. It can be a string or a list of messages.
+                 e.g. ['who are you ?', 'what is your name ?']
+                 e.g. [[{'role': 'user', 'content': 'who are you ?'}], ...]
+             kwargs: The optional arguments for the model.
+         """
+         results = []
+         # with ThreadPoolExecutor() as executor:
+         #     results = list(executor.map(self._generate, inputs))
+
+         for input in inputs:
+             results.append(self._generate(input))
+
+         return results
+
+     def _generate(self, messages: Union[str, List[Dict]]) -> str:
+
+         if isinstance(messages, str):
+             messages = [{'role': 'user', 'content': messages}]
+
+         max_num_retries = 0
+         while max_num_retries < self.retry:
+             # self.wait()
+
+             header = {
+                 'Authorization': f'Bearer {self.openai_api_key}',
+                 'content-type': 'application/json',
+             }
+
+             try:
+                 if self.is_chat:
+                     data = dict(
+                         model=self.model,
+                         messages=messages,
+                         max_tokens=self.max_tokens,
+                         n=1,
+                         logprobs=self.logprobs,
+                         top_logprobs=self.top_logprobs,
+                         stop=None,
+                         temperature=self.temperature,
+                         repetition_penalty=self.repetition_penalty,
+                     )
+                 else:
+                     # TODO: This is a temporary solution for non-chat models.
+                     input_prompts = []
+                     for msg in messages:
+                         input_prompts.append(msg['content'])
+
+                     data = dict(
+                         model=self.model,
+                         prompt='\n'.join(input_prompts),
+                         max_tokens=self.max_tokens,
+                         temperature=self.temperature,
+                         repetition_penalty=self.repetition_penalty,
+                     )
+
+                 def remove_none_val(input_d: dict):
+                     return {k: v for k, v in input_d.items() if v is not None}
+                 data = remove_none_val(data)
+
+                 if self.verbose:
+                     logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
+                 raw_response = requests.post(self.url,
+                                              headers=header,
+                                              data=json.dumps(data, ensure_ascii=False))
+
+                 response = raw_response.json()
+                 if self.verbose:
+                     logger.info(f'>> response: {response}')
+
+                 if self.logprobs:
+                     return response['choices']
+                 else:
+                     if self.is_chat:
+                         return response['choices'][0]['message']['content'].strip()
+                     else:
+                         return response['choices'][0]['text'].strip()
+
+             except Exception as e:
+                 logger.error(f'Error occurs: {str(e)}')
+                 max_num_retries += 1
+                 continue
+
+     def wait(self):
+         return self.token_bucket.get_token()
+
+
+ class TokenBucket:
+     """A token bucket for rate limiting.
+
+     Args:
+         query_per_second (float): The rate of the token bucket.
+     """
+
+     def __init__(self, rate, verbose=False):
+         self._rate = rate
+         self._tokens = threading.Semaphore(0)
+         self.started = False
+         self._request_queue = Queue()
+         self.logger = get_logger()
+         self.verbose = verbose
+
+     def _add_tokens(self):
+         """Add tokens to the bucket."""
+         while True:
+             if self._tokens._value < self._rate:
+                 self._tokens.release()
+             time.sleep(1 / self._rate)
+
+     def get_token(self):
+         """Get a token from the bucket."""
+         if not self.started:
+             self.started = True
+             threading.Thread(target=self._add_tokens, daemon=True).start()
+         self._tokens.acquire()
+         if self.verbose:
+             cur_time = time.time()
+             while not self._request_queue.empty():
+                 if cur_time - self._request_queue.queue[0] > 60:
+                     self._request_queue.get()
+                 else:
+                     break
+             self._request_queue.put(cur_time)
+             self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
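
A minimal usage sketch of the new client, based only on the constructor and generate() shown above; the endpoint URL and key are placeholders. Note that self.url is posted to directly, so it must be the full chat/completions endpoint rather than a bare base URL, and generate_simple() as shipped sends 'Bearer ' without the key (see the '# todo'), so the sketch sticks to generate():

    from evalscope.models.api import OpenaiApi

    client = OpenaiApi(
        model='gpt-4o-mini',
        openai_api_key='sk-...',                                        # placeholder
        openai_api_base='https://api.openai.com/v1/chat/completions',   # full endpoint, not a base URL
        is_chat=True,
        verbose=False,
    )
    answers = client.generate(['who are you ?', 'what is your name ?'])
    print(answers)   # list of stripped response strings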
evalscope/perf/http_client.py CHANGED
@@ -51,15 +51,15 @@ UNLIMITED_RATE = -1


  async def on_request_start(session, context, params):
-     logger.debug(f'Starting request: <{params}>')
+     logger.info(f'Starting request: <{params}>')


  async def on_request_chunk_sent(session, context, params):
-     logger.debug(f'Request body: {params}')
+     logger.info(f'Request body: {params}')


  async def on_response_chunk_received(session, context, params):
-     logger.debug(f'Response info: <{params}>')
+     logger.info(f'Response info: <{params}>')


  class AioHttpClient:
@@ -116,7 +116,7 @@ class AioHttpClient:
              line = line.decode("utf8")
              line = line.rstrip("\n").rstrip("\r")
              if self.debug:
-                 logger.debug(line)
+                 logger.info(line)
              sse_msg = ServerSentEvent.decode(line)
              if not sse_msg:
                  continue
@@ -567,7 +567,7 @@ async def send_requests_worker(task_id, request_queue: asyncio.Queue, benchmark_
          else:
              if response_data:
                  collected_messages.append(response_data)  # save the message
-                 logger.debug(response_data)
+                 logger.info(response_data)
              benchmark_data["chunk_times"].append(time.perf_counter())

          benchmark_data["response_messages"] = collected_messages
evalscope/run.py CHANGED
@@ -207,6 +207,10 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
          from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
          vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
          vlm_eval_kit_backend_manager.run()
+     elif eval_backend == EvalBackend.RAG_EVAL.value:
+         from evalscope.backend.rag_eval import RAGEvalBackendManager
+         rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
+         rag_eval_backend_manager.run()
      # TODO: Add other evaluation backends
      elif eval_backend == EvalBackend.THIRD_PARTY.value:
          raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
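
The new branch lets run_task dispatch to the RAG evaluation backend. A hypothetical invocation follows; the string value of EvalBackend.RAG_EVAL and the eval_config schema are not shown in this diff, so both are assumptions:

    from evalscope.run import run_task

    task_cfg = {
        'eval_backend': 'RAGEval',   # assumed value of EvalBackend.RAG_EVAL.value
        'eval_config': {},           # RAG settings passed to RAGEvalBackendManager(config=eval_config); schema not shown here
    }
    run_task(task_cfg)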
evalscope/third_party/longbench_write/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.third_party.longbench_write.longbench_write import run_task