evalscope 0.6.0rc0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
- evalscope/backend/rag_eval/utils/__init__.py +0 -0
- evalscope/backend/rag_eval/utils/clip.py +149 -0
- evalscope/backend/rag_eval/utils/embedding.py +183 -0
- evalscope/backend/rag_eval/utils/llm.py +72 -0
- evalscope/backend/rag_eval/utils/tools.py +63 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/METADATA +78 -77
- {evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/RECORD +14 -9
- {evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/WHEEL +1 -1
- {evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/top_level.txt +0 -0
evalscope/backend/opencompass/tasks/eval_datasets.py
CHANGED
@@ -51,12 +51,12 @@ with read_base():
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
     from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
-    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+    # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets


 datasets = []
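Net effect of this hunk: the `bbh_gen_5b92b0` import moves from the active imports into the `# Note: to be supported` comment block, so BBH drops out of the default OpenCompass dataset collection in this release.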
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
CHANGED
@@ -1,15 +1,15 @@
-import os
 import asyncio
+import os
+
 import pandas as pd
-from tqdm import tqdm
-from ragas.llms import LangchainLLMWrapper
 from ragas.embeddings import LangchainEmbeddingsWrapper
-from .
-from
-from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
-from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
+from ragas.llms import LangchainLLMWrapper
+from tqdm import tqdm

-
+from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
+from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
+from evalscope.utils.logger import get_logger
+from .translate_prompt import translate_prompts

 logger = get_logger()

@@ -17,116 +17,110 @@ logger = get_logger()
 def get_transform(llm, embedding, language):
     """
     Creates and returns a default set of transforms for processing a knowledge graph.
-
-    This function defines a series of transformation steps to be applied to a
-    knowledge graph, including extracting summaries, keyphrases, titles,
-    headlines, and embeddings, as well as building similarity relationships
-    between nodes.
-
-    The transforms are applied in the following order:
-    1. Parallel extraction of summaries and headlines
-    2. Embedding of summaries for document nodes
-    3. Splitting of headlines
-    4. Parallel extraction of embeddings, keyphrases, and titles
-    5. Building cosine similarity relationships between nodes
-    6. Building cosine similarity relationships between summaries
-
-    Returns
-    -------
-    Transforms
-        A list of transformation steps to be applied to the knowledge graph.
-
     """
     from ragas.testset.transforms.engine import Parallel
     from ragas.testset.transforms.extractors import (
         EmbeddingExtractor,
         HeadlinesExtractor,
-        KeyphrasesExtractor,
         SummaryExtractor,
-        TitleExtractor,
     )
-    from ragas.testset.transforms.
+    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
+    from ragas.testset.transforms.relationship_builders import (
         CosineSimilarityBuilder,
-
+        OverlapScoreBuilder,
     )
     from ragas.testset.transforms.splitters import HeadlineSplitter
+    from ragas.testset.transforms.filters import CustomNodeFilter
     from ragas.testset.graph import NodeType
+    from ragas.utils import num_tokens_from_string
+
+    def summary_filter(node):
+        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)

-
-
-
-    title_extractor = TitleExtractor(llm=llm)
+    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: summary_filter(node))
+    ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+    theme_extractor = ThemesExtractor(llm=llm)
     headline_extractor = HeadlinesExtractor(llm=llm)

     asyncio.run(
         translate_prompts(
             prompts=[
                 summary_extractor,
-
-
+                theme_extractor,
+                ner_extractor,
                 headline_extractor,
             ],
             target_lang=language,
             llm=llm,
             adapt_instruction=True,
-        )
-
+        ))
+
+    splitter = HeadlineSplitter(min_tokens=500)

-
-
-    cosine_sim_builder = CosineSimilarityBuilder(threshold=0.8)
-    summary_embedder = EmbeddingExtractor(
-        name='summary_embedder',
-        filter_nodes=lambda node: True if node.type == NodeType.DOCUMENT else False,
+    summary_emb_extractor = EmbeddingExtractor(
+        embedding_model=embedding,
         property_name='summary_embedding',
         embed_property_name='summary',
-
+        filter_nodes=lambda node: summary_filter(node),
     )
-    summary_cosine_sim_builder = SummaryCosineSimilarityBuilder(threshold=0.6)

-
+    cosine_sim_builder = CosineSimilarityBuilder(
+        property_name='summary_embedding',
+        new_property_name='summary_similarity',
+        threshold=0.7,
+        filter_nodes=lambda node: summary_filter(node),
+    )
+
+    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
+    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
     transforms = [
-
-
-
-
-
-
+        headline_extractor,
+        splitter,
+        summary_extractor,
+        node_filter,
+        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+        Parallel(cosine_sim_builder, ner_overlap_sim),
     ]
+
     return transforms


 def get_distribution(llm, distribution, language):
-    from ragas.testset.synthesizers.
-
-
+    from ragas.testset.synthesizers.multi_hop import (
+        MultiHopAbstractQuerySynthesizer,
+        MultiHopSpecificQuerySynthesizer,
     )
-    from ragas.testset.synthesizers.
+    from ragas.testset.synthesizers.single_hop.specific import (
+        SingleHopSpecificQuerySynthesizer, )

-
-
-
+    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
+    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
+    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)

     asyncio.run(
         translate_prompts(
             prompts=[
-
-
-
+                single_hop,
+                multi_hop_abs,
+                multi_hop_spec,
             ],
             target_lang=language,
             llm=llm,
             adapt_instruction=True,
-        )
-
-
-
-
-
-
+        ))
+
+    mapping = {
+        'simple': single_hop,
+        'multi_context': multi_hop_abs,
+        'reasoning': multi_hop_spec,
+    }
+
+    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]


-def get_knowledge_graph(documents, transforms, local_file):
+def get_knowledge_graph(documents, transforms, local_file, run_config):
     from ragas.testset.graph import KnowledgeGraph, Node, NodeType
     from ragas.testset.transforms import apply_transforms

@@ -148,7 +142,7 @@ def get_knowledge_graph(documents, transforms, local_file):
     kg = KnowledgeGraph(nodes=nodes)

     # apply transforms and update the knowledge graph
-    apply_transforms(kg, transforms)
+    apply_transforms(kg, transforms, run_config=run_config)

     # save the knowledge graph
     output_path = os.path.dirname(local_file)
@@ -158,6 +152,39 @@ def get_knowledge_graph(documents, transforms, local_file):
     return kg


+def get_persona(llm, kg, language):
+    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH
+    from ragas.testset.persona import generate_personas_from_kg, PersonaGenerationPrompt
+    from ragas.testset.graph import Node
+
+    def filter(node: Node) -> bool:
+        if (node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None):
+            return True
+        else:
+            return False
+
+    if language == 'chinese':
+        persona_prompt = PersonaGenerationPromptZH()
+    else:
+        persona_prompt = PersonaGenerationPrompt()
+    # NOTE: can't translate this yet
+    # asyncio.run(
+    #     translate_prompts(
+    #         prompts=[persona_prompt],
+    #         target_lang=language,
+    #         llm=llm,
+    #         adapt_instruction=True,
+    #     ))
+
+    return generate_personas_from_kg(
+        llm=llm,
+        kg=kg,
+        num_personas=3,
+        persona_generation_prompt=persona_prompt,
+        filter_fn=filter,
+    )
+
+
 def load_data(file_path):
     from langchain_community.document_loaders import UnstructuredFileLoader

@@ -178,32 +205,31 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     generator_llm = LLM.load(**args.generator_llm)
     embeddings = EmbeddingModel.load(**args.embeddings)

+    wrapped_llm = LangchainLLMWrapper(generator_llm)
+    wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
+
     # Change resulting question type distribution
-    distributions = get_distribution(
-        LangchainLLMWrapper(generator_llm), args.distribution, args.language
-    )
+    distributions = get_distribution(wrapped_llm, args.distribution, args.language)

+    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
     # get transforms
     transforms = get_transform(
-
-
+        wrapped_llm,
+        wrapped_embeddings,
         args.language,
     )

     # get knowledge graph
-    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
+    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)

-
-
-    )
+    persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
+
+    generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)

-    runconfig = RunConfig(
-        timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
-    )
     testset = generator.generate(
         testset_size=args.test_size,
         query_distribution=distributions,
-        run_config=runconfig,
+        run_config=run_config,
         with_debugging_logs=True,
         raise_exceptions=True,
     )
@@ -212,9 +238,7 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     testset_df = testset.to_pandas()
     output_path = os.path.dirname(args.output_file)
     os.makedirs(output_path, exist_ok=True)
-    testset_df.to_json(
-        args.output_file, indent=4, index=False, orient='records', force_ascii=False
-    )
+    testset_df.to_json(args.output_file, indent=4, index=False, orient='records', force_ascii=False)

     # get answer
     testset_with_answer = get_answer(testset_df, generator_llm, args.language)
@@ -243,21 +267,17 @@ Answer:
         contexts = '\n'.join(row['reference_contexts'])

         # Combine question and contexts as input for the LLM
-        input_text = template.format(
-            language=language, question=question, contexts=contexts
-        )
+        input_text = template.format(language=language, question=question, contexts=contexts)

         # Generate the answer using the generator LLM
         answer = generator_llm.invoke(input_text)
         if isinstance(generator_llm, ChatOpenAI):
             answer = answer.content
-        items.append(
-
-
-
-
-
-        }
-        )
+        items.append({
+            'user_input': question,
+            'retrieved_contexts': row['reference_contexts'],
+            'response': answer,
+            'reference': row['reference'],
+        })

     return pd.DataFrame.from_dict(items)
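Taken together, these hunks port the test-set generator from the old ragas API to the ragas 0.2 knowledge-graph pipeline: extractor/splitter/filter transforms build the graph, personas are now generated from it, and single-hop/multi-hop synthesizers replace the old simple/multi_context/reasoning distribution. A condensed sketch of the new call order, using only names that appear in the diff (`documents`, `language`, and the literal argument values are placeholders):

    wrapped_llm = LangchainLLMWrapper(generator_llm)
    wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)

    transforms = get_transform(wrapped_llm, wrapped_embeddings, language)   # extract -> split -> filter -> embed
    kg = get_knowledge_graph(documents, transforms, local_file, run_config)
    personas = get_persona(llm=wrapped_llm, kg=kg, language=language)       # new in this version

    generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=kg, persona_list=personas)
    testset = generator.generate(testset_size=10, query_distribution=distributions, run_config=run_config)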
evalscope/backend/rag_eval/utils/__init__.py
File without changes
evalscope/backend/rag_eval/utils/clip.py
ADDED
@@ -0,0 +1,149 @@
+import os
+import torch
+import torch.nn.functional as F
+from typing import List
+from PIL import Image
+from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
+from transformers import AutoModel, AutoProcessor
+from langchain_core.embeddings import Embeddings
+
+
+class VisionModel:
+    @staticmethod
+    def load(**kw):
+        api_base = kw.get("api_base", None)
+        if api_base:
+
+            return VLMAPI(
+                model_name=kw.get("model_name", ""),
+                openai_api_base=api_base,
+                openai_api_key=kw.get("api_key", "EMPTY"),
+                prompt=kw.get("prompt", None),
+            )
+        else:
+            return CLIPModel(**kw)
+
+
+class VLMAPI:
+    def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
+        from langchain_openai import ChatOpenAI
+        from langchain_core.prompts import ChatPromptTemplate
+
+        self.model_name = model_name
+        self.model = ChatOpenAI(
+            model_name=model_name,
+            openai_api_base=openai_api_base,
+            openai_api_key=openai_api_key,
+        )
+        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
+        self.prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompt if prompt else self.default_prompt),
+                (
+                    "user",
+                    [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": "data:image/jpeg;base64,{image_data}"},
+                        }
+                    ],
+                ),
+            ]
+        )
+        self.chain = self.prompt | self.model
+        self.transform = PIL_to_base64
+
+    def encode_image(self, images):
+        captions = []
+        for image in images:
+            response = self.chain.invoke({"image_data": image})
+            captions.append(response.content)
+        return captions
+
+
+class CLIPModel(Embeddings):
+    def __init__(
+        self,
+        model_name: str,
+        revision: str = "master",
+        hub="modelscope",
+        device="cpu",
+    ):
+        self.device = device
+        self.model_name = model_name
+        self.revision = revision
+
+        # Download the model if it doesn't exist locally
+        if not os.path.exists(model_name) and hub == "modelscope":
+            model_name = download_model(self.model_name, self.revision)
+
+        # Load the model and processor
+        self.model = AutoModel.from_pretrained(model_name).to(self.device)
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.transform = self.processor.image_processor
+        self.tokenizer = self.processor.tokenizer
+
+    def encode_text(self, batch_texts: List[str] | List[List[str]]):
+        if isinstance(batch_texts[0], list):
+            batch_texts = [
+                text for _, texts in enumerate(batch_texts) for text in texts
+            ]
+        # Ensure that the input texts are within the token limit
+        max_length = self.tokenizer.model_max_length
+        if not max_length or max_length > 0xFFFFFF:
+            max_length = 512
+        encoded_inputs = self.tokenizer(
+            text=batch_texts,
+            max_length=max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+        with torch.no_grad():
+            text_features = self.model.get_text_features(**inputs)
+            text_features = F.normalize(text_features, p=2, dim=-1)
+        return text_features
+
+    def encode_image(self, image):
+        batch_images = torch.stack([d["pixel_values"][0] for d in image])
+        batch_images = batch_images.to(self.device)
+        with torch.no_grad():
+            image_features = self.model.get_image_features(batch_images)
+            image_features = F.normalize(image_features, p=2, dim=-1)
+        return image_features
+
+    def embed_documents(self, texts):
+        text_features = self.encode_text(texts)
+        return text_features.cpu().numpy().tolist()
+
+    def embed_query(self, text):
+        text_features = self.encode_text([text])
+        return text_features.cpu().numpy().tolist()[0]
+
+    def embed_image(self, uris: List[str]):
+        # read image and transform
+        images = [Image.open(image_path) for image_path in uris]
+        transformed_images = [
+            self.transform(
+                image,
+                return_tensors="pt",
+            )
+            for image in images
+        ]
+        image_features = self.encode_image(transformed_images)
+        return image_features.cpu().numpy().tolist()
+
+
+if __name__ == "__main__":
+    model = CLIPModel("AI-ModelScope/chinese-clip-vit-large-patch14-336px")
+    model.embed_image(
+        [
+            "custom_eval/multimodal/images/AMNH.jpg",
+            "custom_eval/multimodal/images/AMNH.jpg",
+        ]
+    )
+    model.encode_text(["我喜欢吃饭" * 1000])
+    print("done")
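Aside (not part of the diff): a minimal usage sketch of the new module. `VisionModel.load` returns the API-backed captioner when `api_base` is set and the local `CLIPModel` otherwise; the model id and image path below are the ones from the file's own `__main__` block, the text queries are illustrative:

    from evalscope.backend.rag_eval.utils.clip import VisionModel

    model = VisionModel.load(model_name="AI-ModelScope/chinese-clip-vit-large-patch14-336px")
    doc_vectors = model.embed_documents(["a photo of a museum", "一张博物馆的照片"])
    img_vectors = model.embed_image(["custom_eval/multimodal/images/AMNH.jpg"])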
evalscope/backend/rag_eval/utils/embedding.py
ADDED
@@ -0,0 +1,183 @@
+import os
+import torch
+from typing import List, Optional, Union, Dict
+from sentence_transformers import models
+from sentence_transformers.SentenceTransformer import SentenceTransformer
+from sentence_transformers.cross_encoder import CrossEncoder
+from torch import Tensor
+from evalscope.backend.rag_eval.utils.tools import download_model
+from evalscope.utils.logger import get_logger
+from langchain_core.embeddings import Embeddings
+
+logger = get_logger()
+
+
+class BaseModel(Embeddings):
+    def __init__(
+        self,
+        model_name_or_path: str,
+        max_seq_length: int = 512,
+        prompt: str = '',
+        revision: Optional[str] = None,
+        **kwargs,
+    ):
+        self.model_name_or_path = model_name_or_path
+        self.max_seq_length = max_seq_length
+        self.model_kwargs = kwargs.pop('model_kwargs', {})
+        self.model_kwargs['trust_remote_code'] = True
+
+        self.config_kwargs = kwargs.pop('config_kwargs', {})
+        self.config_kwargs['trust_remote_code'] = True
+
+        self.encode_kwargs = kwargs.pop('encode_kwargs', {})
+        self.encode_kwargs['convert_to_tensor'] = True
+
+        self.prompt = prompt
+        self.revision = revision
+
+    @property
+    def mteb_model_meta(self):
+        """Model metadata for MTEB (Multilingual Task Embeddings Benchmark)"""
+        from mteb import ModelMeta
+
+        return ModelMeta(
+            name=os.path.basename(self.model_name_or_path),
+            revision=self.revision,
+            languages=None,
+            release_date=None,
+        )
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed search docs. Compact langchain.
+
+        Args:
+            texts: List of text to embed.
+
+        Returns:
+            List of embeddings.
+        """
+        return self.encode_corpus(texts).tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed query text. Compact langchain.
+
+        Args:
+            text: Text to embed.
+
+        Returns:
+            Embedding.
+        """
+        return self.encode_queries(text).tolist()
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
+        """Embed text."""
+        raise NotImplementedError
+
+    def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
+        """Embed query text. Compact mteb."""
+        raise NotImplementedError
+
+    def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
+        """Embed search docs. Compact mteb."""
+        raise NotImplementedError
+
+
+class SentenceTransformerModel(BaseModel):
+    def __init__(
+        self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs
+    ):
+        super().__init__(model_name_or_path, **kwargs)
+
+        if not pooling_mode:
+            self.model = SentenceTransformer(
+                self.model_name_or_path,
+                config_kwargs=self.config_kwargs,
+                model_kwargs=self.model_kwargs,
+            )
+        else:
+            word_embedding_model = models.Transformer(
+                self.model_name_or_path,
+                config_args=self.config_kwargs,
+                model_args=self.model_kwargs,
+            )
+            pooling_model = models.Pooling(
+                word_embedding_model.get_word_embedding_dimension(),
+                pooling_mode=pooling_mode,
+            )
+            self.model = SentenceTransformer(
+                modules=[word_embedding_model, pooling_model],
+            )
+
+        self.model.max_seq_length = self.max_seq_length
+
+    def encode(self, texts: Union[str, List[str]], prompt=None, **kwargs) -> List[torch.Tensor]:
+        kwargs.pop('prompt_name', '')  # remove prompt name, use prompt
+        self.encode_kwargs.update(kwargs)
+
+        embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
+        assert isinstance(embeddings, Tensor)
+        return embeddings.cpu().detach()
+
+    def encode_queries(self, queries, **kwargs):
+        return self.encode(queries, prompt=self.prompt)
+
+    def encode_corpus(self, corpus, **kwargs):
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+        return self.encode(input_texts)
+
+
+class CrossEncoderModel(BaseModel):
+    def __init__(self, model_name_or_path: str, **kwargs):
+        super().__init__(model_name_or_path, **kwargs)
+        self.model = CrossEncoder(
+            self.model_name_or_path,
+            trust_remote_code=True,
+            max_length=self.max_seq_length,
+        )
+
+    def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
+        self.encode_kwargs.update(kwargs)
+
+        if len(sentences[0]) == 3:  # Note: For mteb retrieval task
+            processed_sentences = []
+            for query, docs, instruction in sentences:
+                if isinstance(docs, dict):
+                    docs = docs['text']
+                processed_sentences.append((self.prompt + query, docs))
+            sentences = processed_sentences
+        embeddings = self.model.predict(sentences, **self.encode_kwargs)
+        assert isinstance(embeddings, Tensor)
+        return embeddings
+
+
+class EmbeddingModel:
+    """Custom embeddings"""
+
+    @staticmethod
+    def load(
+        model_name_or_path: str = '',
+        is_cross_encoder: bool = False,
+        hub: str = 'modelscope',
+        revision: Optional[str] = 'master',
+        **kwargs,
+    ):
+        # If model path does not exist and hub is 'modelscope', download the model
+        if not os.path.exists(model_name_or_path) and hub == 'modelscope':
+            model_name_or_path = download_model(model_name_or_path, revision)
+
+        # Return different model instances based on whether it is a cross-encoder and pooling mode
+        if is_cross_encoder:
+            return CrossEncoderModel(
+                model_name_or_path,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            return SentenceTransformerModel(
+                model_name_or_path,
+                revision=revision,
+                **kwargs,
+            )
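Aside, a usage sketch: `EmbeddingModel.load` is the single entry point, dispatching to `SentenceTransformerModel` or `CrossEncoderModel`. The import path matches the re-export used by `testset_generation.py` above; the model ids are hypothetical:

    from evalscope.backend.rag_eval import EmbeddingModel

    emb = EmbeddingModel.load(model_name_or_path="some-org/some-embedding-model")  # hypothetical id
    query_vec = emb.embed_query("What does EvalScope do?")
    doc_vecs = emb.embed_documents(["doc one", "doc two"])

    reranker = EmbeddingModel.load(model_name_or_path="some-org/some-reranker", is_cross_encoder=True)  # hypothetical id
    scores = reranker.predict([["query", "candidate document"]])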
evalscope/backend/rag_eval/utils/llm.py
ADDED
@@ -0,0 +1,72 @@
+import os
+from typing import Any, Dict, Iterator, List, Mapping, Optional
+from modelscope.utils.hf_util import GenerationConfig
+from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+from langchain_core.language_models.llms import LLM as BaseLLM
+from evalscope.models.model_adapter import ChatGenerationModelAdapter
+from langchain_openai import ChatOpenAI
+
+
+class LLM:
+    @staticmethod
+    def load(**kw):
+        api_base = kw.get('api_base', None)
+        if api_base:
+            return ChatOpenAI(
+                model_name=kw.get('model_name', ''),
+                openai_api_base=api_base,
+                openai_api_key=kw.get('api_key', 'EMPTY'),
+            )
+        else:
+            return LocalLLM(**kw)
+
+
+class LocalLLM(BaseLLM):
+    """A custom LLM that loads a model from a given path and performs inference."""
+
+    model_name_or_path: str
+    model_revision: str = 'master'
+    template_type: str = 'default'
+    model_name: Optional[str]
+    model: Optional[ChatGenerationModelAdapter]
+    generation_config: Optional[Dict]
+
+    def __init__(self, **kw):
+        super().__init__(**kw)
+        self.model_name = os.path.basename(self.model_name_or_path)
+        self.model = ChatGenerationModelAdapter(
+            model_id=self.model_name_or_path,
+            model_revision=self.model_revision,
+            template_type=self.template_type,
+            generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
+        )
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Run the LLM on the given input."""
+        infer_cfg = {'stop': stop}
+
+        response = self.model._model_generate(prompt, infer_cfg)
+        return response
+
+    @property
+    def _identifying_params(self) -> Dict[str, Any]:
+        """Return a dictionary of identifying parameters."""
+        return {
+            # The model name allows users to specify custom token counting
+            # rules in LLM monitoring applications (e.g., in LangSmith users
+            # can provide per token pricing for their model and monitor
+            # costs for the given LLM.)
+            'model_name': self.model_name,
+            'revision': self.model_revision,
+        }
+
+    @property
+    def _llm_type(self) -> str:
+        """Get the type of language model used by this chat model. Used for logging purposes only."""
+        return self.model_name
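Aside, a usage sketch: `LLM.load` mirrors `VisionModel.load`, returning a langchain `ChatOpenAI` client for API-served models and a `LocalLLM` (backed by EvalScope's `ChatGenerationModelAdapter`) otherwise. The endpoint and model names are illustrative:

    from evalscope.backend.rag_eval import LLM

    api_llm = LLM.load(model_name="qwen2-7b-instruct", api_base="http://127.0.0.1:8000/v1")  # illustrative endpoint
    local_llm = LLM.load(model_name_or_path="qwen/Qwen2-0.5B-Instruct", template_type="qwen")
    print(local_llm.invoke("Hello"))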
evalscope/backend/rag_eval/utils/tools.py
ADDED
@@ -0,0 +1,63 @@
+import io
+import os
+import base64
+from modelscope import snapshot_download
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def PIL_to_bytes(image_format, **kwargs):
+    OPTIONS = {
+        "webp": dict(format="webp", lossless=True),
+        "png": dict(format="png"),
+        "jpg": dict(format="jpeg"),
+    }
+
+    def transform(image):
+        bytestream = io.BytesIO()
+        image.save(bytestream, **OPTIONS[image_format])
+        return bytestream.getvalue()
+
+    return transform
+
+
+def PIL_to_base64(image, **kwargs):
+    bytestream = io.BytesIO()
+    image.save(bytestream, format="jpeg")
+    return base64.b64encode(bytestream.getvalue()).decode("utf-8")
+
+
+def path_to_bytes(filepath):
+    with open(filepath, "rb") as fp:
+        return fp.read()
+
+
+def path_to_base64(filepath):
+    file_content = path_to_bytes(filepath)
+    return base64.b64encode(file_content).decode("utf-8")
+
+
+def ensure_dir(file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+
+def save_to_jsonl(df, file_path):
+    ensure_dir(file_path)
+    df.to_json(file_path, orient="records", lines=True, force_ascii=False)
+
+
+def save_to_tsv(df, file_path):
+    ensure_dir(file_path)
+    df.to_csv(file_path, sep="\t", index=False)
+
+
+def download_model(model_id: str, revision: str):
+    """
+    default base dir: '~/.cache/modelscope/hub/model_id'
+    """
+    logger.info(f"Loading model {model_id} from modelscope")
+
+    model_path = snapshot_download(model_id=model_id, revision=revision)
+
+    return model_path
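Aside, a usage sketch of the helpers (the model id is the one used in clip.py's `__main__` block, the image path likewise):

    from evalscope.backend.rag_eval.utils.tools import download_model, path_to_base64

    local_path = download_model("AI-ModelScope/chinese-clip-vit-large-patch14-336px", revision="master")
    b64 = path_to_base64("custom_eval/multimodal/images/AMNH.jpg")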
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -51,7 +51,7 @@ try:
     punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'

     if not os.path.exists(punkt_path):
-        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+        os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
         logger.info(f'{punkt_path} already exists, skipping download')
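The only functional change here is defensive: bounding the `wget` fetch of the NLTK `punkt_tab` archive with `--timeout=10 --tries=3`, presumably so an offline or firewalled run fails fast instead of hanging on the download.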
evalscope/version.py
CHANGED

{evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0rc0
+Version: 0.6.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -19,22 +19,22 @@ Requires-Dist: torch
 Requires-Dist: absl-py
 Requires-Dist: accelerate
 Requires-Dist: cachetools
-Requires-Dist: datasets
+Requires-Dist: datasets<=3.0.1,>=3.0.0
 Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
+Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow<=17.0.0
 Requires-Dist: pympler
 Requires-Dist: pyyaml
 Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
@@ -43,82 +43,82 @@ Requires-Dist: simple-ddl-parser
 Requires-Dist: tabulate
 Requires-Dist: tiktoken
 Requires-Dist: tqdm
-Requires-Dist: transformers
+Requires-Dist: transformers>=4.33
 Requires-Dist: transformers-stream-generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
 Provides-Extra: all
-Requires-Dist: torch
-Requires-Dist: absl-py
-Requires-Dist: accelerate
-Requires-Dist: cachetools
-Requires-Dist: datasets
-Requires-Dist: editdistance
-Requires-Dist: jsonlines
-Requires-Dist: matplotlib
-Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
-Requires-Dist: openai
-Requires-Dist: pandas
-Requires-Dist: plotly
-Requires-Dist: pyarrow
-Requires-Dist: pympler
-Requires-Dist: pyyaml
-Requires-Dist: regex
-Requires-Dist: requests
-Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
-Requires-Dist: sacrebleu
-Requires-Dist: scikit-learn
-Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
-Requires-Dist: tabulate
-Requires-Dist: tiktoken
-Requires-Dist: tqdm
-Requires-Dist: transformers
-Requires-Dist: transformers-stream-generator
-Requires-Dist: jieba
-Requires-Dist: rouge-chinese
-Requires-Dist: ms-opencompass
-Requires-Dist: ms-vlmeval
-Requires-Dist: mteb
-Requires-Dist: ragas
-Requires-Dist: webdataset
+Requires-Dist: torch; extra == "all"
+Requires-Dist: absl-py; extra == "all"
+Requires-Dist: accelerate; extra == "all"
+Requires-Dist: cachetools; extra == "all"
+Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
+Requires-Dist: editdistance; extra == "all"
+Requires-Dist: jsonlines; extra == "all"
+Requires-Dist: matplotlib; extra == "all"
+Requires-Dist: modelscope[framework]; extra == "all"
+Requires-Dist: nltk>=3.9; extra == "all"
+Requires-Dist: openai; extra == "all"
+Requires-Dist: pandas; extra == "all"
+Requires-Dist: plotly; extra == "all"
+Requires-Dist: pyarrow<=17.0.0; extra == "all"
+Requires-Dist: pympler; extra == "all"
+Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: regex; extra == "all"
+Requires-Dist: requests; extra == "all"
+Requires-Dist: requests-toolbelt; extra == "all"
+Requires-Dist: rouge-score>=0.1.0; extra == "all"
+Requires-Dist: sacrebleu; extra == "all"
+Requires-Dist: scikit-learn; extra == "all"
+Requires-Dist: seaborn; extra == "all"
+Requires-Dist: sentencepiece; extra == "all"
+Requires-Dist: simple-ddl-parser; extra == "all"
+Requires-Dist: tabulate; extra == "all"
+Requires-Dist: tiktoken; extra == "all"
+Requires-Dist: tqdm; extra == "all"
+Requires-Dist: transformers>=4.33; extra == "all"
+Requires-Dist: transformers-stream-generator; extra == "all"
+Requires-Dist: jieba; extra == "all"
+Requires-Dist: rouge-chinese; extra == "all"
+Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
+Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
+Requires-Dist: mteb==1.19.4; extra == "all"
+Requires-Dist: ragas==0.2.5; extra == "all"
+Requires-Dist: webdataset>0.2.0; extra == "all"
 Provides-Extra: inner
-Requires-Dist: absl-py
-Requires-Dist: accelerate
-Requires-Dist: alibaba-itag-sdk
-Requires-Dist: dashscope
-Requires-Dist: editdistance
-Requires-Dist: jsonlines
-Requires-Dist: nltk
-Requires-Dist: openai
-Requires-Dist: pandas
-Requires-Dist: plotly
-Requires-Dist: pyarrow
-Requires-Dist: pyodps
-Requires-Dist: pyyaml
-Requires-Dist: regex
-Requires-Dist: requests
-Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
-Requires-Dist: sacrebleu
-Requires-Dist: scikit-learn
-Requires-Dist: seaborn
-Requires-Dist: simple-ddl-parser
-Requires-Dist: streamlit
-Requires-Dist: tqdm
-Requires-Dist: transformers
-Requires-Dist: transformers-stream-generator
+Requires-Dist: absl-py; extra == "inner"
+Requires-Dist: accelerate; extra == "inner"
+Requires-Dist: alibaba-itag-sdk; extra == "inner"
+Requires-Dist: dashscope; extra == "inner"
+Requires-Dist: editdistance; extra == "inner"
+Requires-Dist: jsonlines; extra == "inner"
+Requires-Dist: nltk; extra == "inner"
+Requires-Dist: openai; extra == "inner"
+Requires-Dist: pandas==1.5.3; extra == "inner"
+Requires-Dist: plotly; extra == "inner"
+Requires-Dist: pyarrow; extra == "inner"
+Requires-Dist: pyodps; extra == "inner"
+Requires-Dist: pyyaml; extra == "inner"
+Requires-Dist: regex; extra == "inner"
+Requires-Dist: requests==2.28.1; extra == "inner"
+Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
+Requires-Dist: rouge-score; extra == "inner"
+Requires-Dist: sacrebleu; extra == "inner"
+Requires-Dist: scikit-learn; extra == "inner"
+Requires-Dist: seaborn; extra == "inner"
+Requires-Dist: simple-ddl-parser; extra == "inner"
+Requires-Dist: streamlit; extra == "inner"
+Requires-Dist: tqdm; extra == "inner"
+Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
+Requires-Dist: transformers-stream-generator; extra == "inner"
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass
+Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
 Provides-Extra: rag
-Requires-Dist: mteb
-Requires-Dist: ragas
-Requires-Dist: webdataset
+Requires-Dist: mteb==1.19.4; extra == "rag"
+Requires-Dist: ragas==0.2.5; extra == "rag"
+Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval
+Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"



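As rendered in this diff, the 0.6.1 metadata both pins versions (`datasets>=3.0.0,<=3.0.1`, `nltk>=3.9`, `pyarrow<=17.0.0`, `transformers>=4.33`, `mteb==1.19.4`, `ragas==0.2.5`, `webdataset>0.2.0`) and spells out the `extra == "..."` markers on every optional dependency, so `pip install evalscope[rag]` and the other extras resolve the pinned versions. The exact `ragas==0.2.5` pin lines up with the testset-generation rewrite above, which targets the ragas 0.2 knowledge-graph API.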
@@ -139,6 +139,7 @@ Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>

+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -164,7 +165,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**: 
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -251,7 +252,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc 
+ --datasets arc
 ```

 #### Install from source
@@ -358,13 +359,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection. 
+You can use local dataset to evaluate the model without internet connection.

 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report. 
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

{evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/RECORD
CHANGED
@@ -6,7 +6,7 @@ evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
 evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=o4SLhBjhMLzVbUK1flGxf-kiqIBLnLnJbxG06BmvkyU,118
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,7 +14,7 @@ evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG
 evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
-evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=
+evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=3V67A2LSj_XaiGd9fqdKpxpzyNrfynCH3UnhaBtAaqc,5326
 evalscope/backend/rag_eval/__init__.py,sha256=8om6TVnTMmyTEQt1jBuUQA4UfIzyps-_-ih90H_Qjio,284
 evalscope/backend/rag_eval/backend_manager.py,sha256=jmO-UMu6_iOXMnl4--PrMWCsnIYEhsbiX017rtURqm0,2997
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=gDXCiRUTSeGQHxd5SjQsnphMqHJ2si2jywRiHvujEOg,150
@@ -44,8 +44,13 @@ evalscope/backend/rag_eval/ragas/metrics/__init__.py,sha256=HgY5nrcNtWpQ7gBi5lCE
 evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py,sha256=Uqz5qWZ76Gos95_QlhwncbATXyk0YX4wkI0LiAdPElU,3838
 evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py,sha256=CdLnWHq1eTna6j3F5-pncW5YusxD_v3ScjzeCsZ7mng,3967
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=WO2xja0g0JSiYGdu2uAEDQgDceuFcgPWwPoqFnwDU0s,172
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=nX-dG0Fm1629pSASujuEmMODFZf1955WncNNykRrNtI,9305
 evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=bXOqik6qKWzbrEz21ykdkqeqqPrmoUIhTwW6eRQXy0M,2222
+evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/backend/rag_eval/utils/clip.py,sha256=frafvJ1soUtjFUmi-053_Fhg6ERRwyvczQBlLWAX9vE,5104
+evalscope/backend/rag_eval/utils/embedding.py,sha256=RZf0JlovZY_cCBsq8MMUqC_Sy78WtKLY_rBAlRA_udo,6239
+evalscope/backend/rag_eval/utils/llm.py,sha256=9tFwMNoTf3jNomgDu5qqVLO92HtEtelH3DXpny9_B2g,2552
+evalscope/backend/rag_eval/utils/tools.py,sha256=LpcYoeIBj1btzQ1_P84u1dYCdRWhMtiltxihmZCvWKk,1528
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
 evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ewhpE9yzsqf5ED6kqsqek2YEgg96GBQOupxtVNhaXxI,6046
 evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
@@ -132,7 +137,7 @@ evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lX
 evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
 evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
-evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
+evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=MXcHwmsXnh9mQZR1Bt5St6DNwXY-mfz4dNM8y6a23dc,12236
 evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
 evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
 evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
@@ -204,8 +209,8 @@ evalscope/utils/logger.py,sha256=cf3U400Mx1speMMNXorjwEE8noDz5Mbd-9PNgaulGeY,301
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
 evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.6.0rc0.dist-info/METADATA,sha256=
-evalscope-0.6.0rc0.dist-info/WHEEL,sha256=
-evalscope-0.6.0rc0.dist-info/entry_points.txt,sha256=
-evalscope-0.6.0rc0.dist-info/top_level.txt,sha256=
-evalscope-0.6.0rc0.dist-info/RECORD,,
+evalscope-0.6.1.dist-info/METADATA,sha256=n4CpTzJGnhgqEsfbL1UfZtXHULmeNCGnKChyi6eT8Fw,21237
+evalscope-0.6.1.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
+evalscope-0.6.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.6.1.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.6.1.dist-info/RECORD,,
{evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/entry_points.txt
File without changes

{evalscope-0.6.0rc0.dist-info → evalscope-0.6.1.dist-info}/top_level.txt
File without changes