evalscope 0.5.5rc0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)
  1. evalscope/backend/__init__.py +0 -3
  2. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  3. evalscope/backend/rag_eval/__init__.py +4 -0
  4. evalscope/backend/rag_eval/backend_manager.py +80 -0
  5. evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
  6. evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
  7. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
  8. evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
  9. evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  10. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
  11. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
  12. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
  13. evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
  14. evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
  15. evalscope/backend/rag_eval/cmteb/base.py +91 -0
  16. evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
  17. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
  18. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
  19. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
  20. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
  21. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +151 -0
  22. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
  23. evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
  24. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
  25. evalscope/backend/rag_eval/ragas/__init__.py +2 -0
  26. evalscope/backend/rag_eval/ragas/arguments.py +47 -0
  27. evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
  28. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
  29. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
  30. evalscope/backend/rag_eval/ragas/task_template.py +61 -0
  31. evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
  32. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
  33. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
  34. evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
  35. evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
  36. evalscope/evaluator/evaluator.py +1 -0
  37. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
  38. evalscope/models/api/openai_api.py +2 -2
  39. evalscope/perf/http_client.py +1 -1
  40. evalscope/perf/openai_api.py +2 -0
  41. evalscope/run.py +4 -0
  42. evalscope/utils/logger.py +44 -14
  43. evalscope/utils/task_utils.py +3 -0
  44. evalscope/version.py +2 -2
  45. {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/METADATA +95 -99
  46. {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/RECORD +49 -18
  47. {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/WHEEL +1 -1
  48. {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/entry_points.txt +0 -0
  49. {evalscope-0.5.5rc0.dist-info → evalscope-0.6.0.dist-info}/top_level.txt +0 -0

evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py

@@ -0,0 +1,247 @@
+ """
+ Code adapted from https://github.com/mlfoundations/open_clip/blob/main/src/training/zero_shot.py
+ Thanks to the authors of OpenCLIP
+ """
+
+ import logging
+ from contextlib import suppress
+
+ import torch
+ import torch.nn.functional as F
+ from tqdm import tqdm
+
+ from sklearn.metrics import classification_report, balanced_accuracy_score
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def zero_shot_classifier(model, classnames, templates, device, amp=True):
+     """
+     This function returns zero-shot vectors for each class in order
+     to use them for zero-shot classification.
+
+     model:
+         CLIP-like model with `encode_text`
+
+     classnames: list of str
+         names of the classes
+
+     templates: list of str
+         templates to use.
+
+     Returns
+     -------
+
+     torch.Tensor of shape (D, C) where D is the embedding dimension
+     and C is the number of classes.
+     """
+     autocast = torch.amp.autocast if amp else suppress
+     with torch.no_grad(), autocast(device):
+         zeroshot_weights = []
+         for classname in tqdm(classnames):
+             if type(templates) == dict:
+                 # class-specific prompts (e.g., CuPL https://arxiv.org/abs/2209.03320)
+                 texts = templates[classname]
+             elif type(templates) == list:
+                 # generic prompts that are specialized for each class by replacing {c} with the class name
+                 texts = [template.format(c=classname) for template in templates]
+             else:
+                 raise ValueError("templates must be a list or a dict")
+             class_embedding = model.encode_text(texts).mean(dim=0)
+             class_embedding = F.normalize(class_embedding, dim=0)
+             zeroshot_weights.append(class_embedding)
+         zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
+     return zeroshot_weights
+
+
+ def accuracy(output, target, topk=(1,)):
+     """
+     Compute top-k accuracy
+
+     output: torch.Tensor
+         shape (N, C) where N is the number of examples, C the number of classes.
+         these are the logits.
+
+     target: torch.Tensor
+         shape (N,) where N is the number of examples. Groundtruth class id of each example.
+
+     topk: tuple
+         which topk to compute, e.g., topk=(1, 5) will compute top-1 and top-5 accuracies
+
+     Returns
+     -------
+
+     list of top-k accuracies in the same order as `topk`
+     """
+     pred = output.topk(max(topk), 1, True, True)[1].t()
+     correct = pred.eq(target.view(1, -1).expand_as(pred))
+     n = len(target)
+     return [
+         float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) / n
+         for k in topk
+     ]
+
+
+ def run_classification(model, classifier, dataloader, device, amp=True, limit=None):
+     """
+     Run zero-shot classification
+
+     model: torch.nn.Module
+         CLIP-like model with `encode_image` and `encode_text`
+
+     classifier: torch.Tensor
+         obtained from the function `zero_shot_classifier`
+
+     dataloader: torch.utils.data.DataLoader
+
+     Returns
+     -------
+     (pred, true) where
+     - pred (N, C) are the logits
+     - true (N,) are the actual classes
+     """
+     autocast = torch.amp.autocast if amp else suppress
+     pred = []
+     true = []
+     sample_count = 0
+     with torch.no_grad():
+         for images, target in tqdm(dataloader):
+             target = target.to(device)
+
+             with autocast(device):
+                 # predict
+                 image_features = model.encode_image(images)
+                 logits = 100.0 * image_features @ classifier
+
+             if limit is not None:
+                 # Update sample counter
+                 sample_count += len(images)
+
+                 if sample_count >= limit:
+                     break
+
+             true.append(target.cpu())
+             pred.append(logits.float().cpu())
+
+     pred = torch.cat(pred)
+     true = torch.cat(true)
+     return pred, true
+
+
+ def average_precision_per_class(scores, targets):
+     """
+     Compute the average precision for each class.
+     This metric is used for multi-label classification;
+     see explanations here https://fangdahan.medium.com/calculate-mean-average-precision-map-for-multi-label-classification-b082679d31be
+     Code is adapted from https://github.com/pytorch/tnt/blob/master/torchnet/meter/meter.py, thanks to the authors of `tnt`.
+
+     Parameters
+     ----------
+
+     scores: torch.Tensor
+         logits, of shape (N, C) where N is the number of examples, C the number of classes
+
+     targets: torch.Tensor
+         one-hot vectors of groundtruth targets (N, C), where N is the number of examples, C is the
+         number of classes
+
+     Returns
+     -------
+
+     torch.Tensor of shape (C,) of average precision for each class, where C is
+     the number of classes.
+     """
+     ap = torch.zeros(scores.size(1))
+     rg = torch.arange(1, scores.size(0) + 1).float()
+     # compute average precision for each class
+     for k in range(scores.size(1)):
+         # sort scores
+         scores_k = scores[:, k]
+         targets_k = targets[:, k]
+         _, sortind = torch.sort(scores_k, 0, True)
+         truth = targets_k[sortind]
+         tp = truth.float().cumsum(0)
+         # compute precision curve
+         precision = tp.div(rg)
+         # compute average precision
+         ap[k] = precision[truth.bool()].sum() / max(float(truth.sum()), 1)
+     return ap
+
+
+ def evaluate(
+     model,
+     dataloader,
+     classnames,
+     templates,
+     device,
+     amp=True,
+     verbose=False,
+     limit=None,
+ ):
+     """
+     Run zero-shot classification and evaluate the metrics
+
+     Parameters
+     ----------
+
+     model: torch.nn.Module
+         CLIP-like model with `encode_image` and `encode_text`
+
+     dataloader: torch.utils.data.DataLoader
+
+     classnames: list of str
+         class names
+
+     templates: list of str
+         templates to use for zero-shot classification
+
+     device: cpu/cuda
+
+     amp: whether to use automatic mixed precision
+
+     verbose: whether to use verbose mode
+
+     Returns
+     -------
+
+     dict of classification metrics
+     """
+     classifier = zero_shot_classifier(model, classnames, templates, device, amp=amp)
+
+     logits, target = run_classification(model, classifier, dataloader, device, amp=amp, limit=limit)
+     is_multilabel = len(target.shape) == 2
+
+     if is_multilabel:
+         if verbose:
+             logger.info("Detected a multi-label classification dataset")
+         # Multiple labels per image, multiple classes on the dataset
+         ap_per_class = average_precision_per_class(logits, target)
+         if verbose:
+             for class_name, ap in zip(
+                 dataloader.dataset.classes, ap_per_class.tolist()
+             ):
+                 logger.info(f"Class: {class_name}, AveragePrecision: {ap}")
+         return {"mean_average_precision": ap_per_class.mean().item()}
+     else:
+         # Single label per image, multiple classes on the dataset
+         # just compute accuracy and mean_per_class_recall
+
+         pred = logits.argmax(axis=1)
+         # measure accuracy
+         if len(dataloader.dataset.classes) >= 5:
+             acc1, acc5 = accuracy(logits, target, topk=(1, 5))
+         else:
+             (acc1,) = accuracy(logits, target, topk=(1,))
+             acc5 = float("nan")
+         mean_per_class_recall = balanced_accuracy_score(target, pred)
+         if verbose:
+             logger.info("\n" + classification_report(target, pred, digits=3))
+         return {
+             "acc1": acc1,
+             "acc5": acc5,
+             "mean_per_class_recall": mean_per_class_recall,
+         }
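
For orientation, a minimal, illustrative usage sketch of the evaluate() function above (not part of the package diff). DummyCLIP and the random tensors are placeholders; the only assumptions are a model exposing encode_image/encode_text that return (N, D) embeddings, and a dataloader whose dataset carries a classes attribute, which evaluate() reads.

import torch
from torch.utils.data import DataLoader, TensorDataset
from evalscope.backend.rag_eval.clip_benchmark.tasks.zeroshot_classification import evaluate

class DummyCLIP(torch.nn.Module):
    """Stand-in CLIP-like model returning random, normalized embeddings."""
    def encode_image(self, images):
        return torch.nn.functional.normalize(torch.randn(len(images), 64), dim=-1)
    def encode_text(self, texts):
        return torch.nn.functional.normalize(torch.randn(len(texts), 64), dim=-1)

dataset = TensorDataset(torch.randn(16, 3, 224, 224), torch.randint(0, 3, (16,)))
dataset.classes = ["cat", "dog", "bird"]  # evaluate() reads dataloader.dataset.classes
loader = DataLoader(dataset, batch_size=8)

metrics = evaluate(DummyCLIP(), loader, dataset.classes, ["a photo of a {c}"],
                   device="cpu", amp=False)
print(metrics)  # {'acc1': ..., 'acc5': nan, 'mean_per_class_recall': ...}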

evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py

@@ -0,0 +1,170 @@
+ import logging
+ from contextlib import suppress
+
+ import torch
+ import torch.nn.functional as F
+ from tqdm import tqdm
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def evaluate(model, dataloader, device, amp=True, recall_k_list=[5], limit=None):
+     """
+     Evaluate the model on the given dataset
+
+     Parameters
+     ----------
+
+     model: torch.nn.Module
+         CLIP-like model with `encode_image` and `encode_text`
+
+     dataloader: torch.utils.data.DataLoader
+         dataloader to use for evaluation
+
+     device: cpu/cuda
+
+     amp: whether to use automatic mixed precision
+
+     recall_k_list: list of int
+         recall@k k's to use
+
+     limit: int
+         maximum number of samples to evaluate
+
+     Returns
+     -------
+
+     dict of retrieval metrics
+     """
+     # list of batches of image embeddings
+     batch_images_emb_list = []
+     # list of batches of text embeddings
+     batch_texts_emb_list = []
+     # for each text, we collect the corresponding image index, as each image can have multiple corresponding texts
+     texts_image_index = []
+     sample_count = 0
+     dataloader = dataloader_with_indices(dataloader)
+     for batch_images, batch_texts, inds in tqdm(dataloader):
+
+         # store the index of the image for each text
+         batch_texts_image_index = [
+             ind for ind, texts in zip(inds, batch_texts) for text in texts
+         ]
+
+         # compute the embeddings of images and texts
+         batch_images_emb = model.encode_image(batch_images)
+         batch_texts_emb = model.encode_text(batch_texts)
+
+         batch_images_emb_list.append(batch_images_emb.cpu())
+         batch_texts_emb_list.append(batch_texts_emb.cpu())
+         texts_image_index.extend(batch_texts_image_index)
+
+         if limit is not None:
+             # Update sample counter
+             sample_count += len(batch_images)
+
+             if sample_count >= limit:
+                 break
+
+     batch_size = len(batch_images_emb_list[0])
+
+     # concatenate all embeddings
+     images_emb = torch.cat(batch_images_emb_list)
+     texts_emb = torch.cat(batch_texts_emb_list)
+
+     # get the score for each text and image pair
+     scores = texts_emb @ images_emb.t()
+
+     # construct the positive pair matrix, which tells whether each text-image pair is a positive or not
+     positive_pairs = torch.zeros_like(scores, dtype=bool)
+     positive_pairs[torch.arange(len(scores)), texts_image_index] = True
+     metrics = {}
+     for recall_k in recall_k_list:
+         # Note that recall_at_k computes **actual** recall, i.e. nb_true_positive/nb_positives, where the number
+         # of true positives, e.g. for text retrieval, is, for each image, the number of retrieved texts matching that image among the top-k.
+         # Also, the number of positives is the total number of texts matching the image in the dataset; as we have a set of captions
+         # for each image, that number will be greater than 1 for text retrieval.
+         # However, image/text retrieval recall@k, the way it is done in CLIP-like papers, is a bit different:
+         # recall@k, in CLIP-like papers, is, for each image, either 1 or 0. It is 1 if at least one text matches the image among the top-k.
+         # So we can easily compute that from the actual recall, by checking whether there is at least one true positive,
+         # which is the case whenever the recall is greater than 0. Once we compute the recall for each image (or text), we average
+         # it over the dataset.
+         metrics[f"image_retrieval_recall@{recall_k}"] = (
+             (
+                 batchify(
+                     recall_at_k, scores, positive_pairs, batch_size, device, k=recall_k
+                 )
+                 > 0
+             )
+             .float()
+             .mean()
+             .item()
+         )
+         metrics[f"text_retrieval_recall@{recall_k}"] = (
+             (
+                 batchify(
+                     recall_at_k,
+                     scores.T,
+                     positive_pairs.T,
+                     batch_size,
+                     device,
+                     k=recall_k,
+                 )
+                 > 0
+             )
+             .float()
+             .mean()
+             .item()
+         )
+
+     return metrics
+
+
+ def dataloader_with_indices(dataloader):
+     start = 0
+     for x, y in dataloader:
+         end = start + len(x)
+         inds = torch.arange(start, end)
+         yield x, y, inds
+         start = end
+
+
+ def recall_at_k(scores, positive_pairs, k):
+     """
+     Compute the recall at k for each sample
+     :param scores: compatibility score between text and image embeddings (nb texts, nb images)
+     :param k: number of images to consider per text, for retrieval
+     :param positive_pairs: boolean matrix of positive pairs (nb texts, nb images)
+     :return: recall at k averaged over all texts
+     """
+     nb_texts, nb_images = scores.shape
+     # for each text, sort according to image scores in decreasing order
+     topk_indices = torch.topk(scores, k, dim=1)[1]
+     # compute number of positives for each text
+     nb_positive = positive_pairs.sum(dim=1)
+     # nb_texts, k, nb_images
+     topk_indices_onehot = torch.nn.functional.one_hot(
+         topk_indices, num_classes=nb_images
+     )
+     # compute number of true positives
+     positive_pairs_reshaped = positive_pairs.view(nb_texts, 1, nb_images)
+     # a true positive means a positive among the topk
+     nb_true_positive = (topk_indices_onehot * positive_pairs_reshaped).sum(dim=(1, 2))
+     # compute recall at k
+     recall_at_k = nb_true_positive / nb_positive
+     return recall_at_k
+
+
+ def batchify(func, X, Y, batch_size, device, *args, **kwargs):
+     results = []
+     for start in range(0, len(X), batch_size):
+         end = start + batch_size
+         x = X[start:end].to(device)
+         y = Y[start:end].to(device)
+         result = func(x, y, *args, **kwargs).cpu()
+         results.append(result)
+     return torch.cat(results)
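
A tiny, self-contained sanity check of recall_at_k() and of the thresholding convention described in the comments above (illustrative values only, not part of the package diff).

import torch
from evalscope.backend.rag_eval.clip_benchmark.tasks.zeroshot_retrieval import recall_at_k

# 2 texts x 3 images; each text has exactly one positive image
scores = torch.tensor([[0.9, 0.1, 0.2],
                       [0.3, 0.8, 0.7]])
positive_pairs = torch.tensor([[True, False, False],
                               [False, False, True]])

r = recall_at_k(scores, positive_pairs, k=1)  # actual recall per text
print(r)                              # tensor([1., 0.]) -- text 2's positive image is not in its top-1
print((r > 0).float().mean().item())  # 0.5, the CLIP-style recall@1 aggregation used in evaluate()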

evalscope/backend/rag_eval/cmteb/__init__.py

@@ -0,0 +1,4 @@
+ from evalscope.backend.rag_eval.cmteb.tasks import *
+ from evalscope.backend.rag_eval.cmteb.base import *
+ from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments, EvalArguments
+ from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval

evalscope/backend/rag_eval/cmteb/arguments.py

@@ -0,0 +1,61 @@
+ from dataclasses import dataclass, field
+ from typing import List, Optional, Union, Dict, Any
+
+
+ @dataclass
+ class ModelArguments:
+     # Arguments for embedding model: sentence transformer or cross encoder
+     model_name_or_path: str = ""  # model name or path
+     is_cross_encoder: bool = False  # whether the model is a cross encoder
+     # pooling mode: either "cls", "lasttoken", "max", "mean", "mean_sqrt_len_tokens", or "weightedmean"
+     pooling_mode: Optional[str] = None
+     max_seq_length: int = 512  # max sequence length
+     # prompt for LLM-based model
+     prompt: str = ""
+     # model kwargs
+     model_kwargs: dict = field(default_factory=dict)
+     # config kwargs
+     config_kwargs: Dict[str, Any] = field(default_factory=dict)
+     # encode kwargs
+     encode_kwargs: dict = field(
+         default_factory=lambda: {"show_progress_bar": True, "batch_size": 32}
+     )
+     hub: str = "modelscope"  # modelscope or huggingface
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "model_name_or_path": self.model_name_or_path,
+             "is_cross_encoder": self.is_cross_encoder,
+             "pooling_mode": self.pooling_mode,
+             "max_seq_length": self.max_seq_length,
+             "prompt": self.prompt,
+             "model_kwargs": self.model_kwargs,
+             "config_kwargs": self.config_kwargs,
+             "encode_kwargs": self.encode_kwargs,
+             "hub": self.hub,
+         }
+
+
+ @dataclass
+ class EvalArguments:
+     # Evaluation
+     tasks: List[str] = field(default_factory=list)  # task names
+     dataset_path: Optional[str] = None  # custom dataset path
+     verbosity: int = 2  # verbosity level 0-3
+     output_folder: str = "outputs"  # output folder
+     overwrite_results: bool = True  # overwrite results
+     limits: Optional[int] = None  # limit number of samples
+     hub: str = "modelscope"  # modelscope or huggingface
+     top_k: int = 5  # top k for reranking
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "tasks": self.tasks,
+             "dataset_path": self.dataset_path,
+             "verbosity": self.verbosity,
+             "output_folder": self.output_folder,
+             "overwrite_results": self.overwrite_results,
+             "limits": self.limits,
+             "hub": self.hub,
+             "top_k": self.top_k,
+         }
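
A brief, hypothetical example of filling these dataclasses and serializing them with to_dict(); the model path and task name below are placeholders, not values prescribed by the diff.

from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments, EvalArguments

model_args = ModelArguments(
    model_name_or_path="path/to/your/embedding-model",  # placeholder
    pooling_mode="cls",
    max_seq_length=512,
    encode_kwargs={"show_progress_bar": True, "batch_size": 64},
    hub="modelscope",
)
eval_args = EvalArguments(tasks=["TNews"],  # placeholder task name
                          output_folder="outputs",
                          limits=100)

print(model_args.to_dict())  # plain dicts are what the eval templates below consume
print(eval_args.to_dict())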

evalscope/backend/rag_eval/cmteb/base.py

@@ -0,0 +1,91 @@
+ from collections import defaultdict
+ from typing import List
+ from mteb import AbsTask
+ from datasets import DatasetDict
+ from modelscope import MsDataset
+ import datasets
+ from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT, CLS_RETRIEVAL, CLS_CUSTOM
+
+ __all__ = ["TaskBase"]
+
+
+ class TaskBase:
+
+     @staticmethod
+     def get_tasks(task_names, **kwargs) -> List[AbsTask]:
+         return [TaskBase.get_task(task_name, **kwargs) for task_name in task_names]
+
+     @staticmethod
+     def get_task(task_name, **kwargs) -> AbsTask:
+         if task_name in CLS_CUSTOM:
+             task_cls = CLS_CUSTOM[task_name]
+         elif task_name in CLS_DICT:
+             task_cls = CLS_DICT[task_name]
+             task_cls.load_data = load_data
+         else:
+             from mteb.overview import TASKS_REGISTRY
+
+             task_cls = TASKS_REGISTRY[task_name]
+             if task_cls.metadata.type != "Retrieval":
+                 task_cls.load_data = load_data
+
+         # init task instance
+         task_instance = task_cls(**kwargs)
+         return task_instance
+
+
+ def load_data(self, **kwargs):
+     """Load dataset from the hub, compatible with ModelScope and Hugging Face."""
+     if self.data_loaded:
+         return
+
+     limits = kwargs.get("limits", None)
+     hub = kwargs.get("hub", "modelscope")
+     name = self.metadata_dict.get("name")
+     path = self.metadata_dict["dataset"].get("path")
+
+     assert path is not None, "Path must be specified in dataset"
+
+     # Loading the dataset based on the source hub
+     if hub == "modelscope":
+         import re
+
+         path = re.sub(r"^mteb/", "MTEB/", path)
+         dataset = MsDataset.load(path)
+     else:
+         dataset = datasets.load_dataset(**self.metadata_dict["dataset"])  # type: ignore
+
+     if limits is not None:
+         dataset = {
+             split: dataset[split].select(range(min(limits, len(dataset[split]))))
+             for split in dataset.keys()
+         }
+
+     if name in CLS_RETRIEVAL:
+         self.corpus, self.queries, self.relevant_docs = load_retrieval_data(
+             dataset,
+             path,
+             self.metadata_dict["eval_splits"],
+         )
+
+     self.dataset = dataset
+     self.dataset_transform()
+     self.data_loaded = True
+
+
+ def load_retrieval_data(dataset, dataset_name: str, eval_splits: list) -> tuple:
+     eval_split = eval_splits[0]
+     qrels = MsDataset.load(dataset_name + "-qrels")[eval_split]
+
+     corpus = {e["id"]: {"text": e["text"]} for e in dataset["corpus"]}
+     queries = {e["id"]: e["text"] for e in dataset["queries"]}
+     relevant_docs = defaultdict(dict)
+     for e in qrels:
+         relevant_docs[e["qid"]][e["pid"]] = e["score"]
+
+     corpus = DatasetDict({eval_split: corpus})
+     queries = DatasetDict({eval_split: queries})
+     relevant_docs = DatasetDict({eval_split: relevant_docs})
+     return corpus, queries, relevant_docs
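
To make the expected record layout concrete, a small illustration of the transformation load_retrieval_data() performs (the ids and texts are made up; this mirrors the dict comprehensions above and is not part of the diff):

from collections import defaultdict

# toy rows in the shapes the code above reads
corpus_rows = [{"id": "d1", "text": "Beijing is the capital of China."}]
query_rows  = [{"id": "q1", "text": "What is the capital of China?"}]
qrel_rows   = [{"qid": "q1", "pid": "d1", "score": 1}]

corpus  = {e["id"]: {"text": e["text"]} for e in corpus_rows}  # {"d1": {"text": ...}}
queries = {e["id"]: e["text"] for e in query_rows}             # {"q1": "What is ..."}
relevant_docs = defaultdict(dict)
for e in qrel_rows:
    relevant_docs[e["qid"]][e["pid"]] = e["score"]             # {"q1": {"d1": 1}}
# load_retrieval_data() then wraps each mapping in a DatasetDict keyed by the eval split.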

evalscope/backend/rag_eval/cmteb/task_template.py

@@ -0,0 +1,85 @@
+ import os
+ import mteb
+ from evalscope.backend.rag_eval import EmbeddingModel
+ from evalscope.backend.rag_eval import cmteb
+ from mteb.task_selection import results_to_dataframe
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def show_results(output_folder, model, results):
+     model_name = model.mteb_model_meta.model_name_as_path()
+     revision = model.mteb_model_meta.revision
+
+     results_df = results_to_dataframe({model_name: {revision: results}})
+
+     save_path = os.path.join(
+         output_folder,
+         model_name,
+         revision,
+     )
+     logger.info(f"Evaluation results:\n{results_df.to_markdown()}")
+     logger.info(f"Evaluation results saved in {os.path.abspath(save_path)}")
+
+
+ def one_stage_eval(
+     model_args,
+     eval_args,
+ ) -> None:
+     # load model
+     model = EmbeddingModel.load(**model_args)
+     custom_dataset_path = eval_args.pop("dataset_path", None)
+     # load tasks first to update instructions
+     tasks = cmteb.TaskBase.get_tasks(
+         task_names=eval_args["tasks"], dataset_path=custom_dataset_path
+     )
+     evaluation = mteb.MTEB(tasks=tasks)
+
+     # run evaluation
+     results = evaluation.run(model, **eval_args)
+
+     # save and log results
+     show_results(eval_args["output_folder"], model, results)
+
+
+ def two_stage_eval(
+     model1_args,
+     model2_args,
+     eval_args,
+ ) -> None:
+     """A two-stage run where the second stage reads the results saved by the first stage."""
+     # load models
+     dual_encoder = EmbeddingModel.load(**model1_args)
+     cross_encoder = EmbeddingModel.load(**model2_args)
+
+     first_stage_path = f"{eval_args['output_folder']}/stage1"
+     second_stage_path = f"{eval_args['output_folder']}/stage2"
+
+     tasks = cmteb.TaskBase.get_tasks(task_names=eval_args["tasks"])
+     for task in tasks:
+         evaluation = mteb.MTEB(tasks=[task])
+
+         # stage 1: run dual encoder
+         evaluation.run(
+             dual_encoder,
+             save_predictions=True,
+             output_folder=first_stage_path,
+             overwrite_results=True,
+             hub=eval_args["hub"],
+             limits=eval_args["limits"],
+         )
+         # stage 2: run cross encoder
+         results = evaluation.run(
+             cross_encoder,
+             top_k=eval_args["top_k"],
+             save_predictions=True,
+             output_folder=second_stage_path,
+             previous_results=f"{first_stage_path}/{task.metadata.name}_default_predictions.json",
+             overwrite_results=True,
+             hub=eval_args["hub"],
+             limits=eval_args["limits"],
+         )
+
+         # save and log results
+         show_results(second_stage_path, cross_encoder, results)
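
Finally, a hedged sketch of how one_stage_eval() might be driven directly with the argument dataclasses from arguments.py (the model id and task name are placeholders; the keys forwarded to evaluation.run() simply follow the code above):

from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments, one_stage_eval

model_args = ModelArguments(model_name_or_path="path/to/your/embedding-model",  # placeholder
                            hub="modelscope")
eval_args = EvalArguments(tasks=["TNews"],  # placeholder task name
                          output_folder="outputs",
                          limits=100)

# one_stage_eval() pops and indexes keys, so it expects plain dicts, hence to_dict()
one_stage_eval(model_args.to_dict(), eval_args.to_dict())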