evalscope 0.5.5rc0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/__init__.py +4 -0
- evalscope/backend/rag_eval/backend_manager.py +80 -0
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
- evalscope/backend/rag_eval/cmteb/base.py +91 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +151 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +47 -0
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
- evalscope/backend/rag_eval/ragas/task_template.py +61 -0
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +1 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/models/api/openai_api.py +2 -2
- evalscope/perf/http_client.py +1 -1
- evalscope/perf/openai_api.py +2 -0
- evalscope/run.py +4 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/METADATA +95 -99
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/RECORD +49 -18
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/WHEEL +1 -1
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py
@@ -0,0 +1,247 @@
+"""
+Code adapted from https://github.com/mlfoundations/open_clip/blob/main/src/training/zero_shot.py
+Thanks to the authors of OpenCLIP
+"""
+
+import logging
+from contextlib import suppress
+
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+
+from sklearn.metrics import classification_report, balanced_accuracy_score
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def zero_shot_classifier(model, classnames, templates, device, amp=True):
+    """
+    This function returns zero-shot vectors for each class in order
+    to use it for zero-shot classification.
+
+
+    model:
+        CLIP-like model with `encode_text`
+
+    classnames: list of str
+        name of classes
+
+    templates: list of str
+        templates to use.
+
+    Returns
+    -------
+
+    torch.Tensor of shape (N,C) where N is the number
+    of templates, and C is the number of classes.
+    """
+    autocast = torch.amp.autocast if amp else suppress
+    with torch.no_grad(), autocast(device):
+        zeroshot_weights = []
+        for classname in tqdm(classnames):
+            if type(templates) == dict:
+                # class-specific prompts (e.g., CuPL https://arxiv.org/abs/2209.03320)
+                texts = templates[classname]
+            elif type(templates) == list:
+                # generic prompts that are specialized for each class by replacing {c} with the class name
+                texts = [template.format(c=classname) for template in templates]
+            else:
+                raise ValueError("templates must be a list or a dict")
+            class_embedding = model.encode_text(texts).mean(dim=0)
+            class_embedding = F.normalize(class_embedding, dim=0)
+            zeroshot_weights.append(class_embedding)
+        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
+    return zeroshot_weights
+
+
+def accuracy(output, target, topk=(1,)):
+    """
+    Compute top-k accuracy
+
+    output: torch.Tensor
+        shape (N, C) where N is the number of examples, C the number of classes.
+        these are the logits.
+
+    target: torch.Tensor
+        shape (N,) where N is the number of examples. Groundtruth class id of each example.
+
+    topk: tuple
+        which topk to compute, e.g., topk=(1,5) will compute top-1 and top-5 accuracies
+
+    Returns
+    -------
+
+    list of top-k accuracies in the same order as `topk`
+    """
+    pred = output.topk(max(topk), 1, True, True)[1].t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+    n = len(target)
+    return [
+        float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) / n
+        for k in topk
+    ]
+
+
+def run_classification(model, classifier, dataloader, device, amp=True, limit=None):
+    """
+    Run zero-shot classification
+
+    model: torch.nn.Module
+        CLIP-like model with `encode_image` and `encode_text`
+
+    classifier: torch.Tensor
+        obtained from the function `zero_shot_classifier`
+
+    dataloader: torch.utils.data.Dataloader
+
+    Returns
+    -------
+    (pred, true) where
+        - pred (N, C) are the logits
+        - true (N,) are the actual classes
+    """
+    autocast = torch.amp.autocast if amp else suppress
+    pred = []
+    true = []
+    sample_count = 0
+    with torch.no_grad():
+        for images, target in tqdm(dataloader):
+            target = target.to(device)
+
+            with autocast(device):
+                # predict
+                image_features = model.encode_image(images)
+                logits = 100.0 * image_features @ classifier
+
+            if limit is not None:
+                # Update sample counter
+                sample_count += len(images)
+
+                if sample_count >= limit:
+                    break
+
+            true.append(target.cpu())
+            pred.append(logits.float().cpu())
+
+    pred = torch.cat(pred)
+    true = torch.cat(true)
+    return pred, true
+
+
+def average_precision_per_class(scores, targets):
+    """
+    Compute average precision for each class
+    this metric is used for multi-label classification
+    see explanations here https://fangdahan.medium.com/calculate-mean-average-precision-map-for-multi-label-classification-b082679d31be
+    Code is adapted from https://github.com/pytorch/tnt/blob/master/torchnet/meter/meter.py, thanks to the authors of `tnt`.
+
+    Parameters
+    ----------
+
+    scores: torch.Tensor
+        logits, of shape (N,C) where N is the number of examples, C the number of classes
+
+    targets: torch.Tensor
+        one-hot vectors of groundtruth targets (N, C), where N is the number of examples, C is the
+        number of classes
+
+    Returns
+    -------
+
+    torch.Tensor of shape (C,) of average precision for each class, where C is
+    the number of classes.
+
+    """
+    ap = torch.zeros(scores.size(1))
+    rg = torch.arange(1, scores.size(0) + 1).float()
+    # compute average precision for each class
+    for k in range(scores.size(1)):
+        # sort scores
+        scores_k = scores[:, k]
+        targets_k = targets[:, k]
+        _, sortind = torch.sort(scores_k, 0, True)
+        truth = targets_k[sortind]
+        tp = truth.float().cumsum(0)
+        # compute precision curve
+        precision = tp.div(rg)
+        # compute average precision
+        ap[k] = precision[truth.bool()].sum() / max(float(truth.sum()), 1)
+    return ap
+
+
+def evaluate(
+    model,
+    dataloader,
+    classnames,
+    templates,
+    device,
+    amp=True,
+    verbose=False,
+    limit=None,
+):
+    """
+    Run zero-shot classification and evaluate the metrics
+
+    Parameters
+    ----------
+
+    model: torch.nn.Module
+        CLIP-like model with `encode_image` and `encode_text`
+
+    dataloader: torch.utils.data.Dataloader
+
+    classnames: list of str
+        class names
+
+    templates: list of str
+        templates to use for zero-shot classification
+
+    device: cpu/cuda
+
+    amp: whether to use automatic mixed precision
+
+    verbose: whether to use verbose mode
+
+    Returns
+    -------
+
+    dict of classification metrics
+    """
+    classifier = zero_shot_classifier(model, classnames, templates, device, amp=amp)
+
+    logits, target = run_classification(model, classifier, dataloader, device, amp=amp, limit=limit)
+    is_multilabel = len(target.shape) == 2
+
+    if is_multilabel:
+        if verbose:
+            logger.info("Detected a multi-label classification dataset")
+        # Multiple labels per image, multiple classes on the dataset
+        ap_per_class = average_precision_per_class(logits, target)
+        if verbose:
+            for class_name, ap in zip(
+                dataloader.dataset.classes, ap_per_class.tolist()
+            ):
+                logger.info(f"Class: {class_name}, AveragePrecision: {ap}")
+        return {"mean_average_precision": ap_per_class.mean().item()}
+    else:
+        # Single label per image, multiple classes on the dataset
+        # just compute accuracy and mean_per_class_recall
+
+        pred = logits.argmax(axis=1)
+        # measure accuracy
+        if len(dataloader.dataset.classes) >= 5:
+            acc1, acc5 = accuracy(logits, target, topk=(1, 5))
+        else:
+            (acc1,) = accuracy(logits, target, topk=(1,))
+            acc5 = float("nan")
+        mean_per_class_recall = balanced_accuracy_score(target, pred)
+        if verbose:
+            logger.info("\n" + classification_report(target, pred, digits=3))
+        return {
+            "acc1": acc1,
+            "acc5": acc5,
+            "mean_per_class_recall": mean_per_class_recall,
+        }
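The evaluate helper above only needs an object exposing encode_image/encode_text and a dataloader whose dataset has a classes attribute. Below is a minimal sketch of driving it with a toy stand-in model; the DummyCLIP wrapper, the random data, and the class/template lists are illustrative assumptions and not part of the diff, and only the evaluate import comes from evalscope 0.6.0 (assuming it and its optional dependencies are installed).

import torch
from torch.utils.data import DataLoader, TensorDataset

from evalscope.backend.rag_eval.clip_benchmark.tasks.zeroshot_classification import evaluate


class DummyCLIP(torch.nn.Module):
    """Toy stand-in exposing the encode_image / encode_text interface evaluate() expects."""

    def __init__(self, dim=16):
        super().__init__()
        self.dim = dim

    def encode_text(self, texts):
        # one pseudo-embedding per prompt string
        return torch.randn(len(texts), self.dim)

    def encode_image(self, images):
        # flatten each image and keep the first `dim` values as its embedding
        return torch.nn.functional.normalize(images.flatten(1)[:, : self.dim], dim=-1)


class ToyDataset(TensorDataset):
    classes = ["cat", "dog"]  # evaluate() reads dataloader.dataset.classes


images = torch.randn(8, 3, 4, 4)
labels = torch.randint(0, 2, (8,))
loader = DataLoader(ToyDataset(images, labels), batch_size=4)

metrics = evaluate(
    model=DummyCLIP(),
    dataloader=loader,
    classnames=ToyDataset.classes,
    templates=["a photo of a {c}"],
    device="cpu",
    amp=False,  # skip autocast for the CPU toy run
)
print(metrics)  # {'acc1': ..., 'acc5': nan, 'mean_per_class_recall': ...}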
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py
@@ -0,0 +1,170 @@
+import logging
+from contextlib import suppress
+
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def evaluate(model, dataloader, device, amp=True, recall_k_list=[5], limit=None):
+    """
+    Evaluate the model on the given dataset
+
+    Parameters
+    ----------
+
+    model: torch.nn.Module
+        CLIP-like model with `encode_image` and `encode_text`
+
+    dataloader: torch.utils.data.Dataloader
+        dataloader to use for evaluation
+
+    tokenizer:
+        text tokenizer, i.e. convert list of strings to torch.Tensor of integers
+
+    device: cpu/cuda
+
+    amp: whether to use automatic mixed precision
+
+    recall_k_list: list of int
+        recall@k k's to use
+
+    limit: int
+        maximum number of samples to evaluate
+
+    Returns
+    -------
+
+    dict of retrieval metrics
+    """
+    # list of batch of images embedding
+    batch_images_emb_list = []
+    # list of batch of text embedding
+    batch_texts_emb_list = []
+    # for each text, we collect the corresponding image index, as each image can have multiple corresponding texts
+    texts_image_index = []
+    sample_count = 0
+    dataloader = dataloader_with_indices(dataloader)
+    for batch_images, batch_texts, inds in tqdm(dataloader):
+
+        # store the index of image for each text
+        batch_texts_image_index = [
+            ind for ind, texts in zip(inds, batch_texts) for text in texts
+        ]
+
+        # compute the embedding of images and texts
+        batch_images_emb = model.encode_image(batch_images)
+        batch_texts_emb = model.encode_text(batch_texts)
+
+        batch_images_emb_list.append(batch_images_emb.cpu())
+        batch_texts_emb_list.append(batch_texts_emb.cpu())
+        texts_image_index.extend(batch_texts_image_index)
+
+        if limit is not None:
+            # Update sample counter
+            sample_count += len(batch_images)
+
+            if sample_count >= limit:
+                break
+
+    batch_size = len(batch_images_emb_list[0])
+
+    # concatenate all embeddings
+    images_emb = torch.cat(batch_images_emb_list)
+    texts_emb = torch.cat(batch_texts_emb_list)
+
+    # get the score for each text and image pair
+    scores = texts_emb @ images_emb.t()
+
+    # construct the positive pair matrix, which tells whether each text-image pair is a positive or not
+    positive_pairs = torch.zeros_like(scores, dtype=bool)
+    positive_pairs[torch.arange(len(scores)), texts_image_index] = True
+    metrics = {}
+    for recall_k in recall_k_list:
+        # Note that recall_at_k computes **actual** recall i.e. nb_true_positive/nb_positives, where the number
+        # of true positives, e.g. for text retrieval, is, for each image, the number of retrieved texts matching that image among the top-k.
+        # Also, the number of positives are the total number of texts matching the image in the dataset, as we have a set of captions
+        # for each image, that number will be greater than 1 for text retrieval.
+        # However, image/text retrieval recall@k, the way it is done in CLIP-like papers, is a bit different.
+        # recall@k, in CLIP-like papers, is, for each image, either 1 or 0. It is 1 if at least one text matches the image among the top-k.
+        # so we can easily compute that using the actual recall, by checking whether there is at least one true positive,
+        # which would be the case if the recall is greater than 0. Once we compute the recall for each image (or text), we average
+        # it over the dataset.
+        metrics[f"image_retrieval_recall@{recall_k}"] = (
+            (
+                batchify(
+                    recall_at_k, scores, positive_pairs, batch_size, device, k=recall_k
+                )
+                > 0
+            )
+            .float()
+            .mean()
+            .item()
+        )
+        metrics[f"text_retrieval_recall@{recall_k}"] = (
+            (
+                batchify(
+                    recall_at_k,
+                    scores.T,
+                    positive_pairs.T,
+                    batch_size,
+                    device,
+                    k=recall_k,
+                )
+                > 0
+            )
+            .float()
+            .mean()
+            .item()
+        )
+
+    return metrics
+
+
+def dataloader_with_indices(dataloader):
+    start = 0
+    for x, y in dataloader:
+        end = start + len(x)
+        inds = torch.arange(start, end)
+        yield x, y, inds
+        start = end
+
+
+def recall_at_k(scores, positive_pairs, k):
+    """
+    Compute the recall at k for each sample
+    :param scores: compatibility score between text and image embeddings (nb texts, nb images)
+    :param k: number of images to consider per text, for retrieval
+    :param positive_pairs: boolean matrix of positive pairs (nb texts, nb images)
+    :return: recall at k averaged over all texts
+    """
+    nb_texts, nb_images = scores.shape
+    # for each text, sort according to image scores in decreasing order
+    topk_indices = torch.topk(scores, k, dim=1)[1]
+    # compute number of positives for each text
+    nb_positive = positive_pairs.sum(dim=1)
+    # nb_texts, k, nb_images
+    topk_indices_onehot = torch.nn.functional.one_hot(
+        topk_indices, num_classes=nb_images
+    )
+    # compute number of true positives
+    positive_pairs_reshaped = positive_pairs.view(nb_texts, 1, nb_images)
+    # a true positive means a positive among the topk
+    nb_true_positive = (topk_indices_onehot * positive_pairs_reshaped).sum(dim=(1, 2))
+    # compute recall at k
+    recall_at_k = nb_true_positive / nb_positive
+    return recall_at_k
+
+
+def batchify(func, X, Y, batch_size, device, *args, **kwargs):
+    results = []
+    for start in range(0, len(X), batch_size):
+        end = start + batch_size
+        x = X[start:end].to(device)
+        y = Y[start:end].to(device)
+        result = func(x, y, *args, **kwargs).cpu()
+        results.append(result)
+    return torch.cat(results)
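A tiny worked example of the recall@k convention described in the comments above: recall_at_k returns the per-text fraction of positives retrieved in the top-k, and evaluate thresholds it with > 0 so each query counts as 1 as soon as any of its positives appears. The scores and positive pairs below are made-up values for illustration.

import torch

from evalscope.backend.rag_eval.clip_benchmark.tasks.zeroshot_retrieval import recall_at_k

# similarity scores for 2 texts x 3 images (made-up numbers)
scores = torch.tensor([[0.9, 0.1, 0.2],
                       [0.2, 0.8, 0.7]])
# text 0 matches image 0; text 1 matches images 1 and 2
positive_pairs = torch.tensor([[True, False, False],
                               [False, True, True]])

per_text = recall_at_k(scores, positive_pairs, k=1)
print(per_text)                              # tensor([1.0000, 0.5000]) -> actual recall
print((per_text > 0).float().mean().item())  # 1.0 -> CLIP-style recall@1: every text has a hit in its top-1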
evalscope/backend/rag_eval/cmteb/__init__.py
@@ -0,0 +1,4 @@
+from evalscope.backend.rag_eval.cmteb.tasks import *
+from evalscope.backend.rag_eval.cmteb.base import *
+from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments, EvalArguments
+from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval
evalscope/backend/rag_eval/cmteb/arguments.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass, field
+from typing import List, Optional, Union, Dict, Any
+
+
+@dataclass
+class ModelArguments:
+    # Arguments for embedding model: sentence transformer or cross encoder
+    model_name_or_path: str = ""  # model name or path
+    is_cross_encoder: bool = False  # whether the model is a cross encoder
+    # pooling mode: Either “cls”, “lasttoken”, “max”, “mean”, “mean_sqrt_len_tokens”, or “weightedmean”.
+    pooling_mode: Optional[str] = None
+    max_seq_length: int = 512  # max sequence length
+    # prompt for llm based model
+    prompt: str = ""
+    # model kwargs
+    model_kwargs: dict = field(default_factory=dict)
+    # config kwargs
+    config_kwargs: Dict[str, Any] = field(default_factory=dict)
+    # encode kwargs
+    encode_kwargs: dict = field(
+        default_factory=lambda: {"show_progress_bar": True, "batch_size": 32}
+    )
+    hub: str = "modelscope"  # modelscope or huggingface
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "model_name_or_path": self.model_name_or_path,
+            "is_cross_encoder": self.is_cross_encoder,
+            "pooling_mode": self.pooling_mode,
+            "max_seq_length": self.max_seq_length,
+            "prompt": self.prompt,
+            "model_kwargs": self.model_kwargs,
+            "config_kwargs": self.config_kwargs,
+            "encode_kwargs": self.encode_kwargs,
+            "hub": self.hub,
+        }
+
+
+@dataclass
+class EvalArguments:
+    # Evaluation
+    tasks: List[str] = field(default_factory=list)  # task names
+    dataset_path: Optional[str] = None  # custom dataset path
+    verbosity: int = 2  # verbosity level 0-3
+    output_folder: str = "outputs"  # output folder
+    overwrite_results: bool = True  # overwrite results
+    limits: Optional[int] = None  # limit number of samples
+    hub: str = "modelscope"  # modelscope or huggingface
+    top_k: int = 5  # top k for reranking
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "tasks": self.tasks,
+            "dataset_path": self.dataset_path,
+            "verbosity": self.verbosity,
+            "output_folder": self.output_folder,
+            "overwrite_results": self.overwrite_results,
+            "limits": self.limits,
+            "hub": self.hub,
+            "top_k": self.top_k,
+        }
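A minimal sketch of filling in the two dataclasses above and passing them around as plain dicts via to_dict(); the model path and task name below are placeholders, not values taken from the diff.

from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments

model_args = ModelArguments(
    model_name_or_path="AI-ModelScope/m3e-base",  # placeholder embedding model id
    max_seq_length=512,
    hub="modelscope",
)
eval_args = EvalArguments(
    tasks=["TNews"],  # placeholder task name
    output_folder="outputs",
    limits=100,       # cap samples per split for a quick run
)

print(model_args.to_dict())
print(eval_args.to_dict())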
evalscope/backend/rag_eval/cmteb/base.py
@@ -0,0 +1,91 @@
+from collections import defaultdict
+from typing import List
+from mteb import AbsTask
+from datasets import DatasetDict
+from modelscope import MsDataset
+import datasets
+from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT, CLS_RETRIEVAL, CLS_CUSTOM
+
+__all__ = ["TaskBase"]
+
+
+class TaskBase:
+
+    @staticmethod
+    def get_tasks(task_names, **kwargs) -> List[AbsTask]:
+
+        return [TaskBase.get_task(task_name, **kwargs) for task_name in task_names]
+
+    @staticmethod
+    def get_task(task_name, **kwargs) -> AbsTask:
+
+        if task_name in CLS_CUSTOM:
+            task_cls = CLS_CUSTOM[task_name]
+        elif task_name in CLS_DICT:
+            task_cls = CLS_DICT[task_name]
+            task_cls.load_data = load_data
+        else:
+            from mteb.overview import TASKS_REGISTRY
+
+            task_cls = TASKS_REGISTRY[task_name]
+            if task_cls.metadata.type != "Retrieval":
+                task_cls.load_data = load_data
+
+        # init task instance
+        task_instance = task_cls(**kwargs)
+        return task_instance
+
+
+def load_data(self, **kwargs):
+    """Load dataset from the hub, compatible with ModelScope and Hugging Face."""
+    if self.data_loaded:
+        return
+
+    limits = kwargs.get("limits", None)
+    hub = kwargs.get("hub", "modelscope")
+    name = self.metadata_dict.get("name")
+    path = self.metadata_dict["dataset"].get("path")
+
+    assert path is not None, "Path must be specified in dataset"
+
+    # Loading the dataset based on the source hub
+    if hub == "modelscope":
+        import re
+
+        path = re.sub(r"^mteb/", "MTEB/", path)
+        dataset = MsDataset.load(path)
+    else:
+        dataset = datasets.load_dataset(**self.metadata_dict["dataset"])  # type: ignore
+
+    if limits is not None:
+        dataset = {
+            split: dataset[split].select(range(min(limits, len(dataset[split]))))
+            for split in dataset.keys()
+        }
+
+    if name in CLS_RETRIEVAL:
+        self.corpus, self.queries, self.relevant_docs = load_retrieval_data(
+            dataset,
+            path,
+            self.metadata_dict["eval_splits"],
+        )
+
+    self.dataset = dataset
+    self.dataset_transform()
+    self.data_loaded = True
+
+
+def load_retrieval_data(dataset, dataset_name: str, eval_splits: list) -> tuple:
+    eval_split = eval_splits[0]
+    qrels = MsDataset.load(dataset_name + "-qrels")[eval_split]
+
+    corpus = {e["id"]: {"text": e["text"]} for e in dataset["corpus"]}
+    queries = {e["id"]: e["text"] for e in dataset["queries"]}
+    relevant_docs = defaultdict(dict)
+    for e in qrels:
+        relevant_docs[e["qid"]][e["pid"]] = e["score"]
+
+    corpus = DatasetDict({eval_split: corpus})
+    queries = DatasetDict({eval_split: queries})
+    relevant_docs = DatasetDict({eval_split: relevant_docs})
+    return corpus, queries, relevant_docs
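A short sketch of calling TaskBase above directly; the task names are placeholders, and resolution follows the code: custom tasks first, then the CMTEB CLS_DICT registry, then mteb's own TASKS_REGISTRY, with load_data patched so non-retrieval datasets can be pulled from ModelScope.

from evalscope.backend.rag_eval.cmteb import TaskBase

tasks = TaskBase.get_tasks(task_names=["TNews", "ATEC"])  # placeholder task names
for task in tasks:
    print(task.metadata.name, task.metadata.type)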
evalscope/backend/rag_eval/cmteb/task_template.py
@@ -0,0 +1,85 @@
+import os
+import mteb
+from evalscope.backend.rag_eval import EmbeddingModel
+from evalscope.backend.rag_eval import cmteb
+from mteb.task_selection import results_to_dataframe
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def show_results(output_folder, model, results):
+    model_name = model.mteb_model_meta.model_name_as_path()
+    revision = model.mteb_model_meta.revision
+
+    results_df = results_to_dataframe({model_name: {revision: results}})
+
+    save_path = os.path.join(
+        output_folder,
+        model_name,
+        revision,
+    )
+    logger.info(f"Evaluation results:\n{results_df.to_markdown()}")
+    logger.info(f"Evaluation results saved in {os.path.abspath(save_path)}")
+
+
+def one_stage_eval(
+    model_args,
+    eval_args,
+) -> None:
+    # load model
+    model = EmbeddingModel.load(**model_args)
+    custom_dataset_path = eval_args.pop("dataset_path", None)
+    # load task first to update instructions
+    tasks = cmteb.TaskBase.get_tasks(
+        task_names=eval_args["tasks"], dataset_path=custom_dataset_path
+    )
+    evaluation = mteb.MTEB(tasks=tasks)
+
+    # run evaluation
+    results = evaluation.run(model, **eval_args)
+
+    # save and log results
+    show_results(eval_args["output_folder"], model, results)
+
+
+def two_stage_eval(
+    model1_args,
+    model2_args,
+    eval_args,
+) -> None:
+    """a two-stage run with the second stage reading results saved from the first stage."""
+    # load model
+    dual_encoder = EmbeddingModel.load(**model1_args)
+    cross_encoder = EmbeddingModel.load(**model2_args)
+
+    first_stage_path = f"{eval_args['output_folder']}/stage1"
+    second_stage_path = f"{eval_args['output_folder']}/stage2"
+
+    tasks = cmteb.TaskBase.get_tasks(task_names=eval_args["tasks"])
+    for task in tasks:
+        evaluation = mteb.MTEB(tasks=[task])
+
+        # stage 1: run dual encoder
+        evaluation.run(
+            dual_encoder,
+            save_predictions=True,
+            output_folder=first_stage_path,
+            overwrite_results=True,
+            hub=eval_args["hub"],
+            limits=eval_args["limits"],
+        )
+        # stage 2: run cross encoder
+        results = evaluation.run(
+            cross_encoder,
+            top_k=eval_args["top_k"],
+            save_predictions=True,
+            output_folder=second_stage_path,
+            previous_results=f"{first_stage_path}/{task.metadata.name}_default_predictions.json",
+            overwrite_results=True,
+            hub=eval_args["hub"],
+            limits=eval_args["limits"],
+        )
+
+        # save and log results
+        show_results(second_stage_path, cross_encoder, results)
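A minimal end-to-end sketch tying the new CMTEB pieces together: build the argument dataclasses, convert them to dicts, and hand them to one_stage_eval, which loads the embedding model, resolves the tasks and runs mteb. The model path and task name are placeholders.

from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments, one_stage_eval

model_args = ModelArguments(model_name_or_path="AI-ModelScope/m3e-base")        # placeholder model id
eval_args = EvalArguments(tasks=["TNews"], output_folder="outputs", limits=50)  # placeholder task

# one_stage_eval indexes and pops its arguments like dicts, hence to_dict()
one_stage_eval(model_args.to_dict(), eval_args.to_dict())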