evalscope 0.5.5rc1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/__init__.py +4 -0
- evalscope/backend/rag_eval/backend_manager.py +80 -0
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
- evalscope/backend/rag_eval/cmteb/base.py +91 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +151 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +47 -0
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
- evalscope/backend/rag_eval/ragas/task_template.py +61 -0
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +1 -0
- evalscope/models/api/openai_api.py +2 -2
- evalscope/perf/http_client.py +1 -1
- evalscope/perf/openai_api.py +2 -0
- evalscope/run.py +4 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/METADATA +95 -99
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/RECORD +48 -17
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/WHEEL +1 -1
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
ADDED
@@ -0,0 +1,263 @@
+import os
+import asyncio
+import pandas as pd
+from tqdm import tqdm
+from ragas.llms import LangchainLLMWrapper
+from ragas.embeddings import LangchainEmbeddingsWrapper
+from .translate_prompt import translate_prompts
+from evalscope.utils.logger import get_logger
+from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
+from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
+
+os.environ['DO_NOT_TRACK'] = 'true'
+
+logger = get_logger()
+
+
+def get_transform(llm, embedding, language):
+    """
+    Creates and returns a default set of transforms for processing a knowledge graph.
+
+    This function defines a series of transformation steps to be applied to a
+    knowledge graph, including extracting summaries, keyphrases, titles,
+    headlines, and embeddings, as well as building similarity relationships
+    between nodes.
+
+    The transforms are applied in the following order:
+    1. Parallel extraction of summaries and headlines
+    2. Embedding of summaries for document nodes
+    3. Splitting of headlines
+    4. Parallel extraction of embeddings, keyphrases, and titles
+    5. Building cosine similarity relationships between nodes
+    6. Building cosine similarity relationships between summaries
+
+    Returns
+    -------
+    Transforms
+        A list of transformation steps to be applied to the knowledge graph.
+
+    """
+    from ragas.testset.transforms.engine import Parallel
+    from ragas.testset.transforms.extractors import (
+        EmbeddingExtractor,
+        HeadlinesExtractor,
+        KeyphrasesExtractor,
+        SummaryExtractor,
+        TitleExtractor,
+    )
+    from ragas.testset.transforms.relationship_builders.cosine import (
+        CosineSimilarityBuilder,
+        SummaryCosineSimilarityBuilder,
+    )
+    from ragas.testset.transforms.splitters import HeadlineSplitter
+    from ragas.testset.graph import NodeType
+
+    # define the transforms
+    summary_extractor = SummaryExtractor(llm=llm)
+    keyphrase_extractor = KeyphrasesExtractor(llm=llm)
+    title_extractor = TitleExtractor(llm=llm)
+    headline_extractor = HeadlinesExtractor(llm=llm)
+
+    asyncio.run(
+        translate_prompts(
+            prompts=[
+                summary_extractor,
+                keyphrase_extractor,
+                title_extractor,
+                headline_extractor,
+            ],
+            target_lang=language,
+            llm=llm,
+            adapt_instruction=True,
+        )
+    )
+
+    embedding_extractor = EmbeddingExtractor(embedding_model=embedding)
+    headline_splitter = HeadlineSplitter()
+    cosine_sim_builder = CosineSimilarityBuilder(threshold=0.8)
+    summary_embedder = EmbeddingExtractor(
+        name='summary_embedder',
+        filter_nodes=lambda node: True if node.type == NodeType.DOCUMENT else False,
+        property_name='summary_embedding',
+        embed_property_name='summary',
+        embedding_model=embedding,
+    )
+    summary_cosine_sim_builder = SummaryCosineSimilarityBuilder(threshold=0.6)
+
+    # specify the transforms and their order to be applied
+    transforms = [
+        Parallel(summary_extractor, headline_extractor),
+        summary_embedder,
+        headline_splitter,
+        Parallel(embedding_extractor, keyphrase_extractor, title_extractor),
+        cosine_sim_builder,
+        summary_cosine_sim_builder,
+    ]
+    return transforms
+
+
+def get_distribution(llm, distribution, language):
+    from ragas.testset.synthesizers.abstract_query import (
+        AbstractQuerySynthesizer,
+        ComparativeAbstractQuerySynthesizer,
+    )
+    from ragas.testset.synthesizers.specific_query import SpecificQuerySynthesizer
+
+    abstract = AbstractQuerySynthesizer(llm=llm)
+    comparative = ComparativeAbstractQuerySynthesizer(llm=llm)
+    specific = SpecificQuerySynthesizer(llm=llm)
+
+    asyncio.run(
+        translate_prompts(
+            prompts=[
+                abstract,
+                comparative,
+                specific,
+            ],
+            target_lang=language,
+            llm=llm,
+            adapt_instruction=True,
+        )
+    )
+    return [
+        (abstract, distribution['simple']),
+        (comparative, distribution['multi_context']),
+        (specific, distribution['reasoning']),
+    ]
+
+
+def get_knowledge_graph(documents, transforms, local_file):
+    from ragas.testset.graph import KnowledgeGraph, Node, NodeType
+    from ragas.testset.transforms import apply_transforms
+
+    if os.path.exists(local_file):
+        logger.info(f'Loading knowledge graph from {local_file}')
+        return KnowledgeGraph.load(local_file)
+    # convert the documents to Ragas nodes
+    nodes = []
+    for doc in documents:
+        node = Node(
+            type=NodeType.DOCUMENT,
+            properties={
+                'page_content': doc.page_content,
+                'document_metadata': doc.metadata,
+            },
+        )
+        nodes.append(node)
+
+    kg = KnowledgeGraph(nodes=nodes)
+
+    # apply transforms and update the knowledge graph
+    apply_transforms(kg, transforms)
+
+    # save the knowledge graph
+    output_path = os.path.dirname(local_file)
+    os.makedirs(output_path, exist_ok=True)
+    kg.save(local_file)
+    logger.info(f'Knowledge graph saved to {local_file}')
+    return kg
+
+
+def load_data(file_path):
+    from langchain_community.document_loaders import UnstructuredFileLoader
+
+    loader = UnstructuredFileLoader(file_path, mode='elements')
+    data = loader.load()
+    return data
+
+
+def generate_testset(args: TestsetGenerationArguments) -> None:
+
+    from ragas.testset import TestsetGenerator
+    from ragas import RunConfig
+
+    # load data
+    documents = load_data(args.docs)
+
+    # generator with models
+    generator_llm = LLM.load(**args.generator_llm)
+    embeddings = EmbeddingModel.load(**args.embeddings)
+
+    # Change resulting question type distribution
+    distributions = get_distribution(
+        LangchainLLMWrapper(generator_llm), args.distribution, args.language
+    )
+
+    # get transforms
+    transforms = get_transform(
+        LangchainLLMWrapper(generator_llm),
+        LangchainEmbeddingsWrapper(embeddings),
+        args.language,
+    )
+
+    # get knowledge graph
+    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
+
+    generator = TestsetGenerator.from_langchain(
+        generator_llm, embeddings, knowledge_graph
+    )
+
+    runconfig = RunConfig(
+        timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
+    )
+    testset = generator.generate(
+        testset_size=args.test_size,
+        query_distribution=distributions,
+        run_config=runconfig,
+        with_debugging_logs=True,
+        raise_exceptions=True,
+    )
+
+    # save file
+    testset_df = testset.to_pandas()
+    output_path = os.path.dirname(args.output_file)
+    os.makedirs(output_path, exist_ok=True)
+    testset_df.to_json(
+        args.output_file, indent=4, index=False, orient='records', force_ascii=False
+    )
+
+    # get answer
+    testset_with_answer = get_answer(testset_df, generator_llm, args.language)
+    testset_with_answer.to_json(
+        args.output_file.replace('.json', '_with_answer.json'),
+        indent=4,
+        index=False,
+        orient='records',
+        force_ascii=False,
+    )
+
+
+def get_answer(testset_df, generator_llm, language: None):
+    template = """You are an assistant for question-answering tasks.
+    Use the following pieces of retrieved context to answer the question.
+    If you don't know the answer, just say that you don't know. Answer in {language}.
+    Question: {question}
+    Context: {contexts}
+    Answer:
+    """
+
+    items = []
+    for i in tqdm(range(len(testset_df)), desc='Generating Answers'):
+        row = testset_df.iloc[i]
+        question = row['user_input']
+        contexts = '\n'.join(row['reference_contexts'])
+
+        # Combine question and contexts as input for the LLM
+        input_text = template.format(
+            language=language, question=question, contexts=contexts
+        )
+
+        # Generate the answer using the generator LLM
+        answer = generator_llm.invoke(input_text)
+        if isinstance(generator_llm, ChatOpenAI):
+            answer = answer.content
+        items.append(
+            {
+                'user_input': question,
+                'retrieved_contexts': row['reference_contexts'],
+                'response': answer,
+                'reference': row['reference'],
+            }
+        )
+
+    return pd.DataFrame.from_dict(items)
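The module's entry point is generate_testset, driven by a TestsetGenerationArguments object whose fields (docs, generator_llm, embeddings, distribution, language, knowledge_graph, test_size, output_file) are all read in the code above. A minimal usage sketch follows, assuming TestsetGenerationArguments accepts these fields as keyword arguments; the inner dicts passed to LLM.load and EmbeddingModel.load are illustrative assumptions and are not defined in this diff.

from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
from evalscope.backend.rag_eval.ragas.tasks.testset_generation import generate_testset

# Hypothetical arguments; field names mirror the attributes used in generate_testset(),
# but the kwargs for the generator LLM and the embedding model are assumed.
args = TestsetGenerationArguments(
    docs='data/knowledge_base.txt',                            # file loaded by UnstructuredFileLoader
    generator_llm={'model_name': 'qwen2-7b-instruct'},         # assumed kwargs for LLM.load
    embeddings={'model_name': 'AI-ModelScope/bge-large-zh'},   # assumed kwargs for EmbeddingModel.load
    distribution={'simple': 0.5, 'multi_context': 0.3, 'reasoning': 0.2},
    language='chinese',
    knowledge_graph='outputs/knowledge_graph.json',
    test_size=10,
    output_file='outputs/testset.json',
)

# Builds (or reloads) the knowledge graph, generates the test set, then writes
# testset.json plus a *_with_answer.json variant answered by the generator LLM.
generate_testset(args)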
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
ADDED
@@ -0,0 +1,72 @@
+import os
+import asyncio
+from typing import List
+from ragas.prompt import PromptMixin
+from ragas.llms import BaseRagasLLM
+from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
+from evalscope.utils.logger import get_logger
+
+
+logger = get_logger()
+
+
+async def translate_prompt(
+    prompt_user: PromptMixin,
+    target_lang: str,
+    llm: BaseRagasLLM,
+    adapt_instruction: bool = False,
+):
+    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
+        logger.warning(
+            f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}'
+        )
+        return
+
+    if not issubclass(type(prompt_user), PromptMixin):
+        logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
+        return
+
+    class_name = prompt_user.__class__.__name__
+    current_dir = os.path.dirname(__file__)
+    prompt_dir = os.path.abspath(
+        os.path.join(current_dir, f'../prompts/{target_lang}/{class_name}')
+    )
+    os.makedirs(prompt_dir, exist_ok=True)
+
+    try:
+        loader_prompts = prompt_user.load_prompts(prompt_dir, target_lang)
+        prompt_user.set_prompts(**loader_prompts)
+        logger.info(f'Load existing prompts from {prompt_dir}')
+        return
+    except FileNotFoundError:
+        logger.info(f'Not find existing prompts {class_name}, generate new prompts.')
+
+    logger.info(f'Translating prompts to {target_lang}')
+    adapted_prompts = await prompt_user.adapt_prompts(
+        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+    )
+    prompt_user.set_prompts(**adapted_prompts)
+    try:
+        prompt_user.save_prompts(prompt_dir)
+    except FileExistsError:
+        logger.info(f'Find existing prompt {class_name}, skip saving.')
+    logger.info(f'Save new prompts to {prompt_dir}')
+
+    return
+
+
+async def translate_prompts(
+    prompts: List[PromptMixin],
+    target_lang: str,
+    llm: BaseRagasLLM,
+    adapt_instruction: bool = False,
+):
+    if target_lang and target_lang != 'english':
+        await asyncio.gather(
+            *[
+                translate_prompt(prompt, target_lang, llm, adapt_instruction)
+                for prompt in prompts
+            ]
+        )
+
+    logger.info('Translate prompts finished')
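This helper adapts the prompts of any ragas PromptMixin component to a target language once and caches the result on disk under prompts/<lang>/<ClassName>, so later runs reload the saved files instead of calling the LLM again. A sketch of direct use, assuming a LangChain chat model and that the target language is listed in RAGAS_SUPPORTED_LANGUAGE_CODES:

import asyncio
from langchain_openai import ChatOpenAI            # assumed LangChain chat model choice
from ragas.llms import LangchainLLMWrapper
from ragas.testset.transforms.extractors import SummaryExtractor
from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts

llm = LangchainLLMWrapper(ChatOpenAI(model='gpt-4o-mini'))  # model name is illustrative
extractor = SummaryExtractor(llm=llm)

# No-op for 'english'; otherwise each component's prompts are adapted and saved to disk.
asyncio.run(
    translate_prompts(
        prompts=[extractor],
        target_lang='chinese',   # assumed to be among RAGAS_SUPPORTED_LANGUAGE_CODES
        llm=llm,
        adapt_instruction=True,
    )
)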
evalscope/backend/vlm_eval_kit/custom_dataset.py
CHANGED
@@ -8,7 +8,7 @@ class CustomDataset:
 
     def load_data(self, dataset):
        # customize the loading of the dataset
-        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+        data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
        return load(data_path)
 
 
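The one-line fix addresses a common path bug: os.path.join does not expand the tilde, so the old code handed the loader a literal "~/LMUData" path. Wrapping it in os.path.expanduser resolves the home directory first, as the sketch below illustrates.

import os

os.path.join('~/LMUData', 'demo.tsv')
# -> '~/LMUData/demo.tsv'  (tilde kept literally; opening it only works if a
#    directory actually named '~' exists in the working directory)

os.path.join(os.path.expanduser('~/LMUData'), 'demo.tsv')
# -> e.g. '/home/user/LMUData/demo.tsv'  (tilde expanded before joining)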
evalscope/evaluator/evaluator.py
CHANGED
@@ -174,6 +174,7 @@ class Evaluator(object):
         """
         assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
+        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
evalscope/models/api/openai_api.py
CHANGED
@@ -76,12 +76,12 @@ class OpenaiApi:
         data = json.dumps(data, ensure_ascii=False)
 
         if self.verbose:
-
+            logger.info(f'>>data in generate_simple: {data}')
 
         resp = requests.post(self.url, headers=header, data=data)
         resp = resp.json()
         if self.verbose:
-
+            logger.info(f'>>resp in generate_simple: {resp}')
 
         if self.logprobs:
             return resp['choices']
evalscope/perf/http_client.py
CHANGED
@@ -209,7 +209,7 @@ async def dispatch_requests_worker(request_queue: asyncio.Queue, args):
                 prompt = f.read()
         else:
             prompt = args.prompt
-        messages = {'role': 'user', 'content': prompt}
+        messages = [{'role': 'user', 'content': prompt}]
         request = query_generator.build_request(messages, query_parameters)
         if args.number is None:
             await request_queue.put(request)
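The change wraps the single message dict in a list, since OpenAI-compatible chat endpoints expect messages to be an array of role/content objects. Roughly, the payload built from the prompt now has the shape sketched below (the model name is illustrative).

# Sketch of the payload shape the perf worker now produces for a chat endpoint.
request_body = {
    'model': 'qwen-plus',                                # illustrative model name
    'messages': [{'role': 'user', 'content': 'hello'}],  # list of role/content dicts, not a bare dict
}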
evalscope/perf/openai_api.py
CHANGED
@@ -39,6 +39,8 @@ class OpenaiPlugin(ApiPluginBase):
         try:
             if param.query_template is not None:
                 query = json.loads(param.query_template)
+                if 'stream' in query.keys():
+                    param.stream = query['stream']
                 query['messages'] = messages  # replace template messages with input messages.
                 return self.__compose_query_from_parameter(query, param)
             else:
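With this addition, a stream flag set in a user-supplied query template is copied into param.stream, so the plugin's parameters stay consistent with the template. A sketch of such a template (field values are illustrative):

import json

# Illustrative query template; 'messages' is always overwritten with the generated
# input messages, while the 'stream' flag is propagated to param.stream.
query_template = '{"model": "qwen-plus", "stream": true, "temperature": 0.7}'
query = json.loads(query_template)
assert query['stream'] is True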
evalscope/run.py
CHANGED
@@ -207,6 +207,10 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
         vlm_eval_kit_backend_manager.run()
+    elif eval_backend == EvalBackend.RAG_EVAL.value:
+        from evalscope.backend.rag_eval import RAGEvalBackendManager
+        rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
+        rag_eval_backend_manager.run()
     # TODO: Add other evaluation backends
     elif eval_backend == EvalBackend.THIRD_PARTY.value:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
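run_task now dispatches to the new RAG evaluation backend when the task config selects it. A hypothetical config sketch follows; the contents of eval_config for RAGEval are not shown in this diff and are assumed.

from evalscope.run import run_task

task_cfg = {
    'eval_backend': 'RAGEval',   # EvalBackend.RAG_EVAL.value, added in task_utils.py below
    'eval_config': {
        # settings consumed by RAGEvalBackendManager; keys are assumed, not shown in this diff
    },
}
run_task(task_cfg)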
evalscope/utils/logger.py
CHANGED
@@ -1,18 +1,20 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import importlib.util as iutil
 import logging
 from typing import Optional
 
 init_loggers = {}
+format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+formatter = logging.Formatter(format)
 
-
-    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(format=format, level=logging.INFO)
 
 
-def get_logger(
-
-
-    """
+def get_logger(
+    log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = "w"
+):
+    """Get logging logger
 
     Args:
         log_file: Log filename, if specified, file handler will be added to
@@ -22,21 +24,39 @@ def get_logger(log_file: Optional[str] = None,
         specified (if filemode is unspecified, it defaults to 'w').
     """
 
-    logger_name = __name__.split(
+    logger_name = __name__.split(".")[0]
     logger = logging.getLogger(logger_name)
-
+    logger.propagate = False
     if logger_name in init_loggers:
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+        if logger.level != log_level:
+            logger.setLevel(log_level)
         return logger
 
-
-
-
+    # handle duplicate logs to the console
+    # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
+    # to the root logger. As logger.propagate is True by default, this root
+    # level handler causes logging messages from rank>0 processes to
+    # unexpectedly show up on the console, creating much unwanted clutter.
+    # To fix this issue, we set the root logger's StreamHandler, if any, to log
+    # at the ERROR level.
+    torch_dist = False
+    is_worker0 = True
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_dist, is_master
+
+        torch_dist = is_dist()
+        is_worker0 = is_master()
+
+    if torch_dist:
+        for handler in logger.root.handlers:
+            if type(handler) is logging.StreamHandler:
+                handler.setLevel(logging.ERROR)
 
     stream_handler = logging.StreamHandler()
     handlers = [stream_handler]
 
-    if log_file is not None:
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         handlers.append(file_handler)
 
@@ -45,7 +65,10 @@ def get_logger(log_file: Optional[str] = None,
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
-
+    if is_worker0:
+        logger.setLevel(log_level)
+    else:
+        logger.setLevel(logging.ERROR)
 
     init_loggers[logger_name] = True
 
@@ -57,7 +80,14 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
         if isinstance(handler, logging.FileHandler):
             return
 
-    if
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_master
+
+        is_worker0 = is_master()
+    else:
+        is_worker0 = True
+
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         file_handler.setFormatter(formatter)
         file_handler.setLevel(log_level)
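The reworked helper keeps the original call shape but becomes rank-aware: when torch is importable and modelscope reports a distributed run, only the master process logs at the requested level and writes the log file, while other ranks are raised to ERROR. A minimal usage sketch, with an illustrative log file path:

import logging
from evalscope.utils.logger import get_logger

# Matches the signature shown in the diff: optional log file, level, and file mode.
logger = get_logger(log_file='outputs/eval.log', log_level=logging.DEBUG, file_mode='a')
logger.info('shown on the master process; suppressed on other ranks under torch DDP')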
evalscope/utils/task_utils.py
CHANGED
@@ -11,6 +11,9 @@ class EvalBackend(Enum):
 
     # Use VLM Eval Kit as the multi-modal model evaluation backend
     VLM_EVAL_KIT = 'VLMEvalKit'
+
+    # Use RAGEval as the RAG evaluation backend
+    RAG_EVAL = 'RAGEval'
 
     # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
evalscope/version.py
CHANGED