evalscope 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/backend_manager.py +2 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +2 -2
- evalscope/backend/rag_eval/__init__.py +3 -0
- evalscope/backend/rag_eval/backend_manager.py +68 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +59 -0
- evalscope/backend/rag_eval/cmteb/base.py +89 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +83 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +153 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +64 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +37 -0
- evalscope/backend/rag_eval/ragas/task_template.py +117 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +1 -2
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/evaluator/evaluator.py +4 -3
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/models/api/__init__.py +3 -0
- evalscope/models/api/openai_api.py +228 -0
- evalscope/perf/http_client.py +5 -5
- evalscope/run.py +4 -0
- evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/METADATA +46 -60
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/RECORD +48 -18
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/WHEEL +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
ADDED
@@ -0,0 +1,64 @@
+from .Classification import *
+from .Clustering import *
+from .PairClassification import *
+from .Reranking import *
+from .Retrieval import *
+from .STS import *
+
+CLS_CLASSIFICATION = {
+    "TNews": TNews,
+    "IFlyTek": IFlyTek,
+    "MultilingualSentiment": MultilingualSentiment,
+    "JDReview": JDReview,
+    "OnlineShopping": OnlineShopping,
+    "Waimai": Waimai,
+}
+
+CLS_CLUSTERING = {
+    "CLSClusteringS2S": CLSClusteringFastS2S,
+    "CLSClusteringP2P": CLSClusteringFastP2P,
+    "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
+    "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+}
+
+CLS_PAIR_CLASSIFICATION = {
+    "Ocnli": Ocnli,
+    "Cmnli": Cmnli,
+}
+
+CLS_RERANKING = {
+    "T2Reranking": T2Reranking,
+    "MMarcoReranking": MMarcoReranking,
+    "CMedQAv1": CMedQAv1,
+    "CMedQAv2": CMedQAv2,
+}
+
+CLS_RETRIEVAL = {
+    "T2Retrieval": T2Retrieval,
+    "MMarcoRetrieval": MMarcoRetrieval,
+    "DuRetrieval": DuRetrieval,
+    "CovidRetrieval": CovidRetrieval,
+    "CmedqaRetrieval": CmedqaRetrieval,
+    "EcomRetrieval": EcomRetrieval,
+    "MedicalRetrieval": MedicalRetrieval,
+    "VideoRetrieval": VideoRetrieval,
+}
+
+CLS_STS = {
+    "ATEC": ATEC,
+    "BQ": BQ,
+    "LCQMC": LCQMC,
+    "PAWSX": PAWSX,
+    "STSB": STSB,
+    "AFQMC": AFQMC,
+    "QBQTC": QBQTC,
+}
+
+CLS_DICT = {
+    **CLS_CLASSIFICATION,
+    **CLS_CLUSTERING,
+    **CLS_PAIR_CLASSIFICATION,
+    **CLS_RERANKING,
+    **CLS_RETRIEVAL,
+    **CLS_STS,
+}
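The new module above builds a registry mapping CMTEB task names to their task classes. A minimal lookup sketch (not part of the diff; it only assumes the package path shown in the file list above):

    from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT

    # Resolve a registered CMTEB task class by name.
    task_cls = CLS_DICT["TNews"]
    print(task_cls)  # the TNews classification task class re-exported from .Classification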
evalscope/backend/rag_eval/ragas/arguments.py
ADDED
@@ -0,0 +1,37 @@
+from dataclasses import dataclass, field
+from typing import List, Optional, Union, Dict, Any
+
+
+@dataclass
+class TestsetGenerationArguments:
+    docs: List[str] = field(default_factory=list)
+    test_size: int = 10
+    output_file: str = "outputs/testset.json"
+    """
+    For local LLM support, you can use the following fields:
+    model_name_or_path: str
+    model_revision: str = "master"
+    template_type: str = "default"
+    generation_config: Optional[Dict]
+
+    For API LLM support, you can use the following fields:
+    model_name="gpt-4o-mini"
+    api_base: str = "",
+    api_key: Optional[str] = None
+    """
+    generator_llm: Dict = field(default_factory=dict)
+    critic_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    distribution: str = field(
+        default_factory=lambda: {"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1}
+    )
+
+
+@dataclass
+class EvaluationArguments:
+    testset_file: str
+    critic_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    metrics: List[str] = field(
+        default_factory=lambda: ["answer_relevancy", "faithfulness"]
+    )
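A hedged example of filling in TestsetGenerationArguments; the nested dict keys (model_name, api_base, api_key, model_name_or_path) follow the hints in the docstring above, and every concrete value is a placeholder:

    from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments

    # Placeholder values: generator_llm/critic_llm point at an API model, embeddings at a local model.
    args = TestsetGenerationArguments(
        docs=["data/knowledge_base.md"],            # hypothetical input document
        test_size=5,
        output_file="outputs/testset.json",
        generator_llm={"model_name": "gpt-4o-mini", "api_base": "https://api.example.com/v1", "api_key": "sk-..."},
        critic_llm={"model_name": "gpt-4o-mini", "api_base": "https://api.example.com/v1", "api_key": "sk-..."},
        embeddings={"model_name_or_path": "path/to/embedding-model"},
    )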
evalscope/backend/rag_eval/ragas/task_template.py
ADDED
@@ -0,0 +1,117 @@
+import os
+from evalscope.backend.rag_eval import EmbeddingModel, LLM
+from evalscope.utils.logger import get_logger
+from .arguments import TestsetGenerationArguments, EvaluationArguments
+
+logger = get_logger()
+
+
+def testset_generation(args: TestsetGenerationArguments) -> None:
+    from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+    from ragas.testset.generator import TestsetGenerator
+    from ragas.testset.evolutions import simple, reasoning, multi_context
+    from ragas import RunConfig
+
+    # load data
+    file_path = args.docs
+    loader = UnstructuredFileLoader(file_path, mode="elements")
+    data = loader.load()
+
+    # generator with models
+    generator_llm = LLM.load(**args.generator_llm)
+    critic_llm = LLM.load(**args.critic_llm)
+    embeddings = EmbeddingModel.load(**args.embeddings)
+
+    # Change resulting question type distribution
+    distributions = {
+        simple: args.distribution["simple"],
+        multi_context: args.distribution["multi_context"],
+        reasoning: args.distribution["reasoning"],
+    }
+
+    generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)
+
+    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+    testset = generator.generate_with_langchain_docs(
+        data,
+        args.test_size,
+        distributions,
+        with_debugging_logs=True,
+        is_async=False,
+        run_config=runconfig,
+    )
+
+    # save file
+    testset_df = testset.to_pandas()
+    output_path = os.path.dirname(args.output_file)
+    os.makedirs(output_path, exist_ok=True)
+    testset_df.to_json(args.output_file, indent=4, index=False, orient="records")
+
+    # get answer
+    testset_with_answer = get_answer(testset_df, generator_llm)
+    testset_with_answer.to_json(
+        args.output_file, indent=4, index=False, orient="records"
+    )
+
+
+def get_answer(testset_df, generator_llm):
+    template = """You are an assistant for question-answering tasks.
+    Use the following pieces of retrieved context to answer the question.
+    If you don't know the answer, just say that you don't know.
+    Use two sentences maximum and keep the answer concise.
+    Question: {question}
+    Context: {contexts}
+    Answer:
+    """
+    answers = []
+    for index, row in testset_df.iterrows():
+        question = row["question"]
+        contexts = "\n".join(row["contexts"])
+
+        # Combine question and contexts as input for the LLM
+        input_text = template.format(question=question, contexts=contexts)
+
+        # Generate the answer using the generator LLM
+        answer = generator_llm.invoke(input_text)
+        answers.append(answer)
+
+    testset_df["answer"] = answers
+    return testset_df
+
+
+def rag_eval(
+    args: EvaluationArguments,
+) -> None:
+    from datasets import Dataset
+    from ragas import evaluate
+    from evalscope.backend.rag_eval import EmbeddingModel, LLM
+    from ragas import RunConfig
+    import importlib
+
+    def dynamic_import(module_name, *function_names):
+        # Dynamically import the specified module
+        module = importlib.import_module(module_name)
+
+        functions = [getattr(module, name) for name in function_names]
+        return functions
+
+    llm = LLM.load(**args.critic_llm)
+    embedding = EmbeddingModel.load(**args.embeddings)
+
+    dataset = Dataset.from_json(args.testset_file)
+
+    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+    score = evaluate(
+        dataset,
+        metrics=dynamic_import("ragas.metrics", *args.metrics),
+        llm=llm,
+        embeddings=embedding,
+        run_config=runconfig,
+    )
+    score_df = score.to_pandas()
+    # logger.info(score_df.to_string())
+
+    output_path = args.testset_file.split(".")[0] + "_score.json"
+    score_df.to_json(output_path, indent=4, index=False, orient="records")
+
+    logger.info(f"Eval score saved to {output_path}")
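Given a generated testset, rag_eval above loads it with datasets.Dataset.from_json, dynamically imports the requested ragas metrics, and writes <testset>_score.json next to the input file. A usage sketch under the same placeholder assumptions as the previous example:

    from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments
    from evalscope.backend.rag_eval.ragas.task_template import rag_eval

    eval_args = EvaluationArguments(
        testset_file="outputs/testset.json",
        critic_llm={"model_name": "gpt-4o-mini", "api_base": "https://api.example.com/v1", "api_key": "sk-..."},
        embeddings={"model_name_or_path": "path/to/embedding-model"},
    )
    # Scores the default answer_relevancy and faithfulness metrics and saves outputs/testset_score.json.
    rag_eval(eval_args)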
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED
@@ -1,10 +1,9 @@
 from typing import Optional, Union
-from evalscope.utils import is_module_installed,
+from evalscope.utils import is_module_installed, get_valid_list
 from evalscope.backend.base import BackendManager
 from evalscope.utils.logger import get_logger
 from functools import partial
 import subprocess
-from dataclasses import dataclass
 import copy
 
 logger = get_logger()
evalscope/backend/vlm_eval_kit/custom_dataset.py
CHANGED
@@ -8,7 +8,7 @@ class CustomDataset:
 
     def load_data(self, dataset):
         # customize the loading of the dataset
-        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+        data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
         return load(data_path)
 
 
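The one-line fix matters because os.path.join does not expand "~", so the old path kept a literal tilde that file loading cannot resolve. A quick illustration (dataset name is hypothetical):

    import os

    print(os.path.join("~/LMUData", "MyDataset.tsv"))
    # -> '~/LMUData/MyDataset.tsv'  (literal '~', not a real directory)
    print(os.path.join(os.path.expanduser("~/LMUData"), "MyDataset.tsv"))
    # -> '/home/<user>/LMUData/MyDataset.tsv'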
evalscope/benchmarks/benchmark.py
CHANGED
@@ -46,7 +46,7 @@ class Benchmark(object):
 
             dataset.dataset_name = dataset_name.split('/')[-1]
             dataset.subset_name = subset
-            dataset.split = split
+            # dataset.split = split
             return dataset
         elif hub == 'HuggingFace':
             # TODO: implement this by xingjun.wxj@alibaba-inc.com
evalscope/evaluator/evaluator.py
CHANGED
@@ -174,6 +174,7 @@ class Evaluator(object):
         """
         assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
+        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
@@ -244,8 +245,8 @@ class Evaluator(object):
             answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
 
             if debug:
-                logger.
-                logger.
+                logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+                logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
 
             answers_list.append(answer_d)
 
@@ -349,7 +350,7 @@ class Evaluator(object):
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
             if debug:
-                logger.
+                logger.info(review_d)
 
             reviews_list.append(review_d)
 
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -31,6 +31,7 @@ In these examples settings.xml lists input files and formats.
 from __future__ import absolute_import, division, print_function
 import collections
 import re
+import os
 
 import nltk
 import numpy as np
@@ -38,6 +39,24 @@ import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+# Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
+try:
+    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+    os.makedirs(nltk_dir, exist_ok=True)
+    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+    if not os.path.exists(punkt_path):
+        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+        os.system(f'unzip {punkt_path} -d {nltk_dir}')
+    else:
+        logger.info(f'{punkt_path} already exists, skipping download')
+except Exception as e:
+    logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
 
 class RougeScorer(scoring.BaseScorer):
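The block added above shells out to wget and unzip to pre-seed nltk's punkt_tab tokenizer from a mirror. For reference, a standard-library-only sketch of the same download-and-extract step (not part of the release) would look like this:

    import os
    import urllib.request
    import zipfile

    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
    os.makedirs(nltk_dir, exist_ok=True)
    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'

    if not os.path.exists(punkt_path):
        urllib.request.urlretrieve(punkt_tab_url, punkt_path)   # download the archive
        with zipfile.ZipFile(punkt_path) as zf:
            zf.extractall(nltk_dir)                              # unpack into ~/nltk_data/tokenizers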
evalscope/models/api/openai_api.py
ADDED
@@ -0,0 +1,228 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import json
+import threading
+import time
+from asyncio import Queue
+
+import requests
+from typing import Union, List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class OpenaiApi:
+
+    def __init__(self,
+                 model: str,
+                 openai_api_key,
+                 openai_api_base,
+                 logprobs: Optional[bool] = False,
+                 top_logprobs: Optional[int] = None,
+                 max_new_tokens: int = 4096,
+                 temperature: Optional[float] = 0.0,
+                 repetition_penalty: Optional[float] = 1.0,
+                 is_chat: bool = True,
+                 verbose: bool = True,
+                 retry: int = 3,
+                 query_per_second: int = 10,  # TODO
+                 **kwargs):
+
+        self.temperature = temperature
+        self.repetition_penalty = repetition_penalty
+        self.max_tokens = max_new_tokens
+        self.logprobs = logprobs
+        self.top_logprobs = top_logprobs
+
+        self.openai_api_key = openai_api_key
+        self.url = openai_api_base
+        self.model = model
+        self.is_chat = is_chat
+        self.retry = retry
+        self.verbose = verbose
+
+        self.token_bucket = TokenBucket(query_per_second, verbose)
+
+    def generate_simple(self, inputs: Union[List[str]]):
+
+        def process_one(in_data: str):
+
+            if self.is_chat:
+                data = dict(
+                    model=self.model,
+                    messages=[{'role': 'user', 'content': in_data}],
+                    max_tokens=self.max_tokens,
+                    n=1,
+                    logprobs=self.logprobs,
+                    top_logprobs=self.top_logprobs,
+                    stop=None,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+            else:
+                data = dict(
+                    model=self.model,
+                    prompt=in_data,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+
+            # todo
+            openai_api_key = self.openai_api_key or ''
+            header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+            data = json.dumps(data, ensure_ascii=False)
+
+            if self.verbose:
+                print(f'>>data in generate_simple: {data}')
+
+            resp = requests.post(self.url, headers=header, data=data)
+            resp = resp.json()
+            if self.verbose:
+                print(f'>>resp in generate_simple: {resp}')
+
+            if self.logprobs:
+                return resp['choices']
+            else:
+                if self.is_chat:
+                    return resp['choices'][0]['message']['content'].strip()
+                else:
+                    return resp['choices'][0]['text'].strip()
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(process_one, inputs))
+
+        return results
+
+    def generate(self,
+                 inputs: Union[List[str], List[List]],
+                 **kwargs) -> List[str]:
+        """
+        Generate responses from OpenAI API.
+
+        Args:
+            inputs: The input messages for the model. It can be a string or a list of messages.
+                e.g. ['who are you ?', 'what is your name ?']
+                e.g. [[{'role': 'user', 'content': 'who are you ?'}], ...]
+            kwargs: The optional arguments for the model.
+        """
+        results = []
+        # with ThreadPoolExecutor() as executor:
+        #     results = list(executor.map(self._generate, inputs))
+
+        for input in inputs:
+            results.append(self._generate(input))
+
+        return results
+
+    def _generate(self, messages: Union[str, List[Dict]]) -> str:
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        max_num_retries = 0
+        while max_num_retries < self.retry:
+            # self.wait()
+
+            header = {
+                'Authorization': f'Bearer {self.openai_api_key}',
+                'content-type': 'application/json',
+            }
+
+            try:
+                if self.is_chat:
+                    data = dict(
+                        model=self.model,
+                        messages=messages,
+                        max_tokens=self.max_tokens,
+                        n=1,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                        stop=None,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+                else:
+                    # TODO: This is a temporary solution for non-chat models.
+                    input_prompts = []
+                    for msg in messages:
+                        input_prompts.append(msg['content'])
+
+                    data = dict(
+                        model=self.model,
+                        prompt='\n'.join(input_prompts),
+                        max_tokens=self.max_tokens,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+
+                def remove_none_val(input_d: dict):
+                    return {k: v for k, v in input_d.items() if v is not None}
+                data = remove_none_val(data)
+
+                if self.verbose:
+                    logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
+                raw_response = requests.post(self.url,
+                                             headers=header,
+                                             data=json.dumps(data, ensure_ascii=False))
+
+                response = raw_response.json()
+                if self.verbose:
+                    logger.info(f'>> response: {response}')
+
+                if self.logprobs:
+                    return response['choices']
+                else:
+                    if self.is_chat:
+                        return response['choices'][0]['message']['content'].strip()
+                    else:
+                        return response['choices'][0]['text'].strip()
+
+            except Exception as e:
+                logger.error(f'Error occurs: {str(e)}')
+                max_num_retries += 1
+                continue
+
+    def wait(self):
+        return self.token_bucket.get_token()
+
+
+class TokenBucket:
+    """A token bucket for rate limiting.
+
+    Args:
+        query_per_second (float): The rate of the token bucket.
+    """
+
+    def __init__(self, rate, verbose=False):
+        self._rate = rate
+        self._tokens = threading.Semaphore(0)
+        self.started = False
+        self._request_queue = Queue()
+        self.logger = get_logger()
+        self.verbose = verbose
+
+    def _add_tokens(self):
+        """Add tokens to the bucket."""
+        while True:
+            if self._tokens._value < self._rate:
+                self._tokens.release()
+            time.sleep(1 / self._rate)
+
+    def get_token(self):
+        """Get a token from the bucket."""
+        if not self.started:
+            self.started = True
+            threading.Thread(target=self._add_tokens, daemon=True).start()
+        self._tokens.acquire()
+        if self.verbose:
+            cur_time = time.time()
+            while not self._request_queue.empty():
+                if cur_time - self._request_queue.queue[0] > 60:
+                    self._request_queue.get()
+                else:
+                    break
+            self._request_queue.put(cur_time)
+            self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
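A minimal usage sketch of the new OpenaiApi client (endpoint URL, key, and model name are placeholders). Note that openai_api_base is posted to directly via requests.post, so it should be the full completions endpoint rather than a bare base URL:

    from evalscope.models.api.openai_api import OpenaiApi

    client = OpenaiApi(
        model='gpt-4o-mini',                                            # placeholder model name
        openai_api_key='sk-...',                                        # placeholder key
        openai_api_base='https://api.example.com/v1/chat/completions',  # full endpoint, not a base URL
        is_chat=True,
        verbose=False,
    )
    # Accepts plain strings or OpenAI-style message lists, mirroring the docstring above.
    answers = client.generate(['who are you ?', 'what is your name ?'])
    print(answers)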
evalscope/perf/http_client.py
CHANGED
@@ -51,15 +51,15 @@ UNLIMITED_RATE = -1
 
 
 async def on_request_start(session, context, params):
-    logger.
+    logger.info(f'Starting request: <{params}>')
 
 
 async def on_request_chunk_sent(session, context, params):
-    logger.
+    logger.info(f'Request body: {params}')
 
 
 async def on_response_chunk_received(session, context, params):
-    logger.
+    logger.info(f'Response info: <{params}>')
 
 
 class AioHttpClient:
@@ -116,7 +116,7 @@ class AioHttpClient:
             line = line.decode("utf8")
             line = line.rstrip("\n").rstrip("\r")
             if self.debug:
-                logger.
+                logger.info(line)
             sse_msg = ServerSentEvent.decode(line)
             if not sse_msg:
                 continue
@@ -567,7 +567,7 @@ async def send_requests_worker(task_id, request_queue: asyncio.Queue, benchmark_
         else:
             if response_data:
                 collected_messages.append(response_data)  # save the message
-                logger.
+                logger.info(response_data)
             benchmark_data["chunk_times"].append(time.perf_counter())
 
         benchmark_data["response_messages"] = collected_messages
evalscope/run.py
CHANGED
@@ -207,6 +207,10 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
         vlm_eval_kit_backend_manager.run()
+    elif eval_backend == EvalBackend.RAG_EVAL.value:
+        from evalscope.backend.rag_eval import RAGEvalBackendManager
+        rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
+        rag_eval_backend_manager.run()
     # TODO: Add other evaluation backends
     elif eval_backend == EvalBackend.THIRD_PARTY.value:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
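With this hunk, run_task can dispatch to the new RAG evaluation backend. A hypothetical configuration sketch, assuming the config keys mirror the other backends and that EvalBackend.RAG_EVAL.value is the string 'RAGEval' (neither is shown in this diff):

    from evalscope.run import run_task

    task_cfg = {
        'eval_backend': 'RAGEval',   # assumed value of EvalBackend.RAG_EVAL.value
        'eval_config': {
            # Passed straight through as RAGEvalBackendManager(config=eval_config);
            # the expected keys live in evalscope/backend/rag_eval/backend_manager.py.
        },
    }
    run_task(task_cfg)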