mteb 2.3.7__py3-none-any.whl → 2.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_evaluators/classification_metrics.py +54 -0
- mteb/abstasks/_data_filter/__init__.py +0 -0
- mteb/abstasks/_data_filter/filters.py +125 -0
- mteb/abstasks/_data_filter/task_pipelines.py +102 -0
- mteb/abstasks/multilabel_classification.py +8 -1
- mteb/benchmarks/benchmarks/benchmarks.py +2 -2
- mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json +61 -0
- mteb/leaderboard/benchmark_selector.py +7 -2
- mteb/models/model_implementations/e5_models.py +3 -101
- mteb/models/model_implementations/facebookai.py +147 -0
- mteb/models/model_implementations/kblab.py +24 -0
- mteb/models/model_implementations/kfst.py +24 -0
- mteb/models/model_implementations/nbailab.py +67 -0
- mteb/tasks/bitext_mining/multilingual/__init__.py +2 -1
- mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py +46 -0
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/METADATA +1 -1
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/RECORD +21 -13
- mteb/models/model_implementations/nb_sbert.py +0 -25
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/WHEEL +0 -0
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/top_level.txt +0 -0
mteb/_evaluators/classification_metrics.py (new file)

@@ -0,0 +1,54 @@
+import numpy as np
+
+
+def hamming_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    """Compute the Hamming score (a.k.a. label-based accuracy) for multilabel classification.
+
+    The Hamming score is the fraction of labels that are correctly predicted for each sample,
+    averaged over all samples. For samples where both y_true and y_pred have no labels,
+    the score is 1.0 (perfect agreement).
+
+    Args:
+        y_true: Binary matrix of true labels with shape (n_samples, n_labels)
+        y_pred: Binary matrix of predicted labels with shape (n_samples, n_labels)
+
+    Returns:
+        float: Hamming score between 0.0 and 1.0
+
+    Raises:
+        ValueError: If inputs are invalid or have incompatible shapes
+        TypeError: If inputs cannot be converted to numpy arrays
+    """
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+
+    # Check shapes
+    if y_true.shape != y_pred.shape:
+        raise ValueError(
+            f"Shape mismatch: y_true {y_true.shape} != y_pred {y_pred.shape}"
+        )
+
+    # Check if arrays are empty
+    if y_true.size == 0:
+        raise ValueError("Input arrays cannot be empty")
+
+    # Ensure 2D arrays
+    if y_true.ndim != 2:
+        raise ValueError(f"Arrays must be 2D, got {y_true.ndim}D")
+
+    # Check for binary values
+    if not (np.all(np.isin(y_true, [0, 1])) and np.all(np.isin(y_pred, [0, 1]))):
+        raise ValueError("Arrays must contain only binary values (0 and 1)")
+
+    # Convert to boolean for bitwise operations
+    y_true_bool = y_true.astype(bool)
+    y_pred_bool = y_pred.astype(bool)
+
+    # Calculate intersection and union for each sample
+    intersection = (y_true_bool & y_pred_bool).sum(axis=1)
+    union = (y_true_bool | y_pred_bool).sum(axis=1)
+
+    # Handle division by zero: when union is 0, both are all zeros, so score is 1.0
+    scores = np.where(union == 0, 1.0, intersection / union)
+
+    return float(scores.mean())
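For orientation, a minimal worked example of the new helper (a sketch: the toy matrices are illustrative, and the import path simply mirrors the location of the file added above):

import numpy as np

from mteb._evaluators.classification_metrics import hamming_score

y_true = np.array([[1, 0, 1], [0, 0, 0]])
y_pred = np.array([[1, 1, 1], [0, 0, 0]])

# Sample 1: intersection = 2, union = 3 -> 2/3; sample 2: both rows empty -> 1.0
print(hamming_score(y_true, y_pred))  # (2/3 + 1.0) / 2 ≈ 0.833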
mteb/abstasks/_data_filter/__init__.py: File without changes
mteb/abstasks/_data_filter/filters.py (new file)

@@ -0,0 +1,125 @@
+"""Simplified version of https://gist.github.com/AlexeyVatolin/ea3adc21aa7a767603ff393b22085adc from https://github.com/embeddings-benchmark/mteb/pull/2900"""
+
+import logging
+
+import datasets
+import pandas as pd
+from datasets import Dataset, DatasetDict
+
+from mteb import TaskMetadata
+
+logger = logging.getLogger(__name__)
+
+
+def deduplicate(dataset: Dataset, input_column: str) -> Dataset:
+    """Remove duplicate texts, keeping the first occurrence."""
+    unique_texts = set()
+    indices_to_keep = []
+    for i, text in enumerate(dataset[input_column]):
+        text = text.strip()
+        if text not in unique_texts:
+            unique_texts.add(text)
+            indices_to_keep.append(i)
+
+    logger.info(
+        f"[deduplicate] removed={len(dataset) - len(indices_to_keep)}/{len(dataset)}"
+    )
+    return dataset.select(indices_to_keep)
+
+
+def filter_empty(dataset: Dataset, input_column: str) -> Dataset:
+    """Filter out empty or whitespace-only examples."""
+    before = len(dataset)
+    ds = dataset.filter(lambda x: len(x[input_column].strip()) > 0)
+    logger.info(f"[filter_empty] removed={before - len(ds)}/{before}")
+    return ds
+
+
+def filter_train_leakage(
+    train_dataset: Dataset, test_dataset: Dataset, input_column: str
+) -> Dataset:
+    """Remove test examples that appear in training."""
+    train_texts = set(train_dataset[input_column])
+    before = len(test_dataset)
+    indices = [
+        i
+        for i, text in enumerate(test_dataset[input_column])
+        if text not in train_texts
+    ]
+    logger.info(f"[filter_train_leakage] removed={before - len(indices)}/{before}")
+    return test_dataset.select(indices)
+
+
+def filter_unclear_label(
+    dataset_dict: DatasetDict, input_column: str, label_column: str
+) -> DatasetDict:
+    """Remove examples where the same text appears with multiple different labels."""
+    normalized: dict[str, set[str | tuple[str, ...]]] = {}
+    logger.debug("[filter_controversial] scanning dataset for label conflicts...")
+
+    for split, ds in dataset_dict.items():
+        for text, label in zip(ds[input_column], ds[label_column]):
+            key = text.strip().lower()
+            normalized.setdefault(key, set()).add(
+                label if isinstance(label, (str, int, float)) else tuple(label)
+            )
+
+    bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
+    logger.info(f"[filter_controversial] Removing {len(bad_texts)} conflicting texts")
+
+    new_dict = {}
+    for split, ds in dataset_dict.items():
+        before = len(ds)
+        filtered = ds.filter(lambda x: x[input_column].strip().lower() not in bad_texts)
+        logger.debug(
+            f"[filter_controversial:{split}] removed={before - len(filtered)}/{before}"
+        )
+        new_dict[split] = filtered
+
+    return DatasetDict(new_dict)
+
+
+def filter_short(dataset: Dataset, input_column: str, min_words: int = 3) -> Dataset:
+    """Filter out texts with fewer than `min_words`."""
+    before = len(dataset)
+    ds = dataset.filter(lambda x: len(x[input_column].strip().split()) >= min_words)
+    logger.debug(f"[filter_short] removed={before - len(ds)}/{before}")
+    return ds
+
+
+def split_train_test(
+    ds: DatasetDict,
+    metadata: TaskMetadata,
+    train_split: str,
+    label_column: str,
+) -> DatasetDict:
+    if train_split in ds and metadata.eval_splits == train_split:
+        before = len(ds[train_split])
+        logger.info(
+            f"[split_train_test] eval_splits == train_split; performing split on {before} examples"
+        )
+        ds[train_split] = ds[train_split].cast_column(
+            label_column,
+            datasets.ClassLabel(names=list(set(ds[train_split][label_column]))),
+        )
+        label_counts = pd.Series(ds[train_split][label_column]).value_counts()
+        one_sample_labels = set(label_counts[label_counts == 1].index.tolist())
+
+        if one_sample_labels:
+            logger.info(
+                f"[split_train_test] Removing {len(one_sample_labels)} labels with only one instance"
+            )
+            ds[train_split] = ds[train_split].filter(
+                lambda x: x[label_column] not in one_sample_labels
+            )
+
+        splits = ds[train_split].train_test_split(
+            test_size=min(2048, before // 2), seed=42, stratify_by_column=label_column
+        )
+        ds = DatasetDict({train_split: splits[train_split], "test": splits["test"]})
+        metadata.eval_splits = ["test"]
+        logger.info(
+            f"[split_train_test] Train size={len(ds[train_split])}, Test size={len(ds['test'])}"
+        )
+
+    return ds
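For context, a minimal sketch of how these helpers compose on a toy Hugging Face Dataset (illustrative data only; the module is private, so the import is shown purely for demonstration):

from datasets import Dataset

from mteb.abstasks._data_filter.filters import deduplicate, filter_empty, filter_short

ds = Dataset.from_dict(
    {"text": ["a short one", "a short one", "   ", "this sentence has five words"]}
)
ds = filter_empty(ds, input_column="text")   # drops the whitespace-only row
ds = deduplicate(ds, input_column="text")    # keeps only the first "a short one"
ds = filter_short(ds, input_column="text", min_words=3)
print(ds["text"])  # ['a short one', 'this sentence has five words']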
mteb/abstasks/_data_filter/task_pipelines.py (new file)

@@ -0,0 +1,102 @@
+import logging
+
+from datasets import DatasetDict
+
+from mteb import TaskMetadata
+from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks._data_filter.filters import (
+    deduplicate,
+    filter_empty,
+    filter_short,
+    filter_train_leakage,
+    filter_unclear_label,
+    split_train_test,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def clean_dataset(
+    ds: DatasetDict,
+    metadata: TaskMetadata,
+    train_split: str,
+    input_column: str,
+    label_column: str,
+    subset: str | None = None,
+) -> DatasetDict:
+    """Apply the full cleaning pipeline with logging."""
+    logger.info("[clean_dataset] Starting dataset cleaning pipeline...")
+
+    transforms = [
+        ("filter_empty", filter_empty),
+        ("deduplicate", deduplicate),
+    ]
+
+    skip_cjk_codes = {"zho", "jpn", "tha", "mya", "cmn"}
+    logger.info("[clean_dataset] Applying short-text filter")
+    cur_langs = (
+        metadata.eval_langs[subset]
+        if isinstance(metadata.eval_langs, dict) and subset
+        else metadata.eval_langs
+    )
+    apply_short = not any(lang.split("-")[0] in skip_cjk_codes for lang in cur_langs)
+    if apply_short:
+        logger.info("[clean_dataset] Applying short-text filter")
+        transforms.append(("filter_short", filter_short))
+
+    for split in [train_split, *metadata.eval_splits]:
+        if split not in ds:
+            logger.warning(f"[clean_dataset] Split '{split}' missing; skipping.")
+            continue
+
+        for name, fn in transforms:
+            before = len(ds[split])
+            ds[split] = fn(ds[split], input_column=input_column)
+            logger.info(
+                f"[clean_dataset:{split}] {name} removed={before - len(ds[split])}"
+            )
+
+    ds = split_train_test(ds, metadata, train_split, label_column)
+
+    for split in metadata.eval_splits:
+        if split == train_split:
+            continue
+        before = len(ds[split])
+        ds[split] = filter_train_leakage(ds[train_split], ds[split], input_column)
+        logger.info(
+            f"[clean_dataset:{split}] leakage_removed={before - len(ds[split])}"
+        )
+
+    ds = filter_unclear_label(ds, input_column=input_column, label_column=label_column)
+
+    logger.info("[clean_dataset] Cleaning pipeline complete.")
+    return ds
+
+
+def process_classification(
+    task: AbsTaskClassification,
+) -> DatasetDict | dict[str, DatasetDict]:
+    """Process classification task dataset(s) with cleaning pipeline."""
+    if not task.data_loaded:
+        task.load_data()
+    if isinstance(task.dataset, DatasetDict):
+        return clean_dataset(
+            task.dataset,
+            task.metadata,
+            task.train_split,
+            task.input_column_name,
+            task.label_column_name,
+            subset=None,
+        )
+
+    new_ds = {}
+    for subset in task.dataset:
+        new_ds[subset] = clean_dataset(
+            task.dataset[subset],
+            task.metadata,
+            task.train_split,
+            task.input_column_name,
+            task.label_column_name,
+            subset=subset,
+        )
+    return new_ds
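A hypothetical invocation of the new pipeline entry point (a sketch only: the task name below merely stands in for any classification task registered in the installed version, and the module is private, so the import path may change):

import mteb

from mteb.abstasks._data_filter.task_pipelines import process_classification

# "Banking77Classification" is used here as an example AbsTaskClassification;
# process_classification loads the task data, cleans each split, and returns
# a DatasetDict (or a dict of DatasetDicts for multi-subset tasks).
task = mteb.get_tasks(tasks=["Banking77Classification"])[0]
cleaned = process_classification(task)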
mteb/abstasks/multilabel_classification.py

@@ -14,6 +14,7 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from typing_extensions import override
 
 from mteb._create_dataloaders import create_dataloader
+from mteb._evaluators.classification_metrics import hamming_score
 from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
 from mteb.models import EncoderProtocol
 
@@ -40,11 +41,13 @@ class MultilabelClassificationMetrics(TypedDict):
         accuracy: Accuracy of the classifier.
         lrap: Label Ranking Average Precision (LRAP) score.
         f1: Macro F1 score.
+        hamming: Hamming score (label-based accuracy).
     """
 
     accuracy: float
     lrap: float
     f1: float
+    hamming: float
 
 
 class FullMultilabelClassificationMetrics(MultilabelClassificationMetrics):
@@ -157,7 +160,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
 
         logger.info("Running multilabel classification - Evaluating classifiers...")
         all_predictions = []
-        for
+        for _, sample_indices in enumerate(train_samples):
             X_train = np.stack([unique_train_embeddings[idx] for idx in sample_indices])
             y_train = train_split.select(sample_indices)[self.label_column_name]
             y_train = binarizer.transform(y_train)
@@ -207,10 +210,12 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
         else:
             lrap = label_ranking_average_precision_score(y_test, y_pred)
             f1 = f1_score(y_test, y_pred, average="macro")
+            hamming = hamming_score(y_test, y_pred)
         return MultilabelClassificationMetrics(
             accuracy=accuracy,
             lrap=lrap,
             f1=f1,
+            hamming=hamming,
         )
 
     def _undersample_data_indices(
@@ -218,6 +223,8 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
     ) -> tuple[list[int], list[int]]:
        """Undersample data to have samples_per_label samples of each label.
 
+        Currently ensures that each label has at least samples_per_label samples.
+
         Returns:
             A tuple containing:
                 - List of sampled indices.
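Downstream, the per-experiment scores mapping simply gains a hamming key. A small sketch with illustrative values (not measured results), assuming the TypedDict is imported from the module shown above:

from mteb.abstasks.multilabel_classification import MultilabelClassificationMetrics

# Shape of the metrics dict after this change; numbers are placeholders.
metrics: MultilabelClassificationMetrics = {
    "accuracy": 0.71,
    "lrap": 0.83,
    "f1": 0.64,
    "hamming": 0.77,
}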
mteb/benchmarks/benchmarks/benchmarks.py

@@ -309,7 +309,7 @@ RU_SCI_BENCH = Benchmark(
     tasks=get_tasks(
         tasks=[
             # BitextMining
-            "RuSciBenchBitextMining",
+            "RuSciBenchBitextMining.v2",
             # Classification
             "RuSciBenchCoreRiscClassification",
             "RuSciBenchGRNTIClassification.v2",
@@ -963,7 +963,7 @@ MTEB_multilingual_v2 = Benchmark(
 
 MTEB_JPN = Benchmark(
     name="MTEB(jpn, v1)",
-    display_name="Japanese",
+    display_name="Japanese Legacy",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
     tasks=get_tasks(
         languages=["jpn"],
mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json (new file)

@@ -0,0 +1,61 @@
+{
+  "test": {
+    "num_samples": 19928,
+    "number_of_characters": 35466331,
+    "unique_pairs": 19928,
+    "sentence1_statistics": {
+      "total_text_length": 17733346,
+      "min_text_length": 103,
+      "average_text_length": 889.8708350060217,
+      "max_text_length": 11576,
+      "unique_texts": 19928
+    },
+    "sentence2_statistics": {
+      "total_text_length": 17732985,
+      "min_text_length": 103,
+      "average_text_length": 889.8527197912485,
+      "max_text_length": 11576,
+      "unique_texts": 19928
+    },
+    "hf_subset_descriptive_stats": {
+      "ru-en": {
+        "num_samples": 9965,
+        "number_of_characters": 17734926,
+        "unique_pairs": 9965,
+        "sentence1_statistics": {
+          "total_text_length": 8685585,
+          "min_text_length": 103,
+          "average_text_length": 871.6091319618665,
+          "max_text_length": 5675,
+          "unique_texts": 9965
+        },
+        "sentence2_statistics": {
+          "total_text_length": 9049341,
+          "min_text_length": 106,
+          "average_text_length": 908.1124937280482,
+          "max_text_length": 11576,
+          "unique_texts": 9965
+        }
+      },
+      "en-ru": {
+        "num_samples": 9963,
+        "number_of_characters": 17731405,
+        "unique_pairs": 9963,
+        "sentence1_statistics": {
+          "total_text_length": 9047761,
+          "min_text_length": 106,
+          "average_text_length": 908.1362039546322,
+          "max_text_length": 11576,
+          "unique_texts": 9963
+        },
+        "sentence2_statistics": {
+          "total_text_length": 8683644,
+          "min_text_length": 103,
+          "average_text_length": 871.5892803372478,
+          "max_text_length": 5675,
+          "unique_texts": 9963
+        }
+      }
+    }
+  }
+}
mteb/leaderboard/benchmark_selector.py

@@ -71,7 +71,7 @@ GP_BENCHMARK_ENTRIES = [
             "MTEB(cmn, v1)",
             "MTEB(deu, v1)",
             "MTEB(fra, v1)",
-            "
+            "JMTEB(v2)",
             "MTEB(kor, v1)",
             "MTEB(nld, v1)",
             "MTEB(pol, v1)",
@@ -84,7 +84,12 @@ GP_BENCHMARK_ENTRIES = [
     MenuEntry(
         "Other",
         mteb.get_benchmarks(
-            [
+            [
+                "MTEB(eng, v1)",
+                "MTEB(fas, v1)",
+                "MTEB(rus, v1)",
+                "MTEB(jpn, v1)",
+            ]
         ),
     )
 ],
mteb/models/model_implementations/e5_models.py

@@ -5,108 +5,10 @@ from mteb.models.model_meta import (
 from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
 from mteb.types import PromptType
 
+from .facebookai import XLMR_LANGUAGES
+
 E5_PAPER_RELEASE_DATE = "2024-02-08"
-
-    "afr-Latn",
-    "amh-Latn",
-    "ara-Latn",
-    "asm-Latn",
-    "aze-Latn",
-    "bel-Latn",
-    "bul-Latn",
-    "ben-Latn",
-    "ben-Beng",
-    "bre-Latn",
-    "bos-Latn",
-    "cat-Latn",
-    "ces-Latn",
-    "cym-Latn",
-    "dan-Latn",
-    "deu-Latn",
-    "ell-Latn",
-    "eng-Latn",
-    "epo-Latn",
-    "spa-Latn",
-    "est-Latn",
-    "eus-Latn",
-    "fas-Latn",
-    "fin-Latn",
-    "fra-Latn",
-    "fry-Latn",
-    "gle-Latn",
-    "gla-Latn",
-    "glg-Latn",
-    "guj-Latn",
-    "hau-Latn",
-    "heb-Latn",
-    "hin-Latn",
-    "hin-Deva",
-    "hrv-Latn",
-    "hun-Latn",
-    "hye-Latn",
-    "ind-Latn",
-    "isl-Latn",
-    "ita-Latn",
-    "jpn-Latn",
-    "jav-Latn",
-    "kat-Latn",
-    "kaz-Latn",
-    "khm-Latn",
-    "kan-Latn",
-    "kor-Latn",
-    "kur-Latn",
-    "kir-Latn",
-    "lat-Latn",
-    "lao-Latn",
-    "lit-Latn",
-    "lav-Latn",
-    "mlg-Latn",
-    "mkd-Latn",
-    "mal-Latn",
-    "mon-Latn",
-    "mar-Latn",
-    "msa-Latn",
-    "mya-Latn",
-    "nep-Latn",
-    "nld-Latn",
-    "nob-Latn",
-    "orm-Latn",
-    "ori-Latn",
-    "pan-Latn",
-    "pol-Latn",
-    "pus-Latn",
-    "por-Latn",
-    "ron-Latn",
-    "rus-Latn",
-    "san-Latn",
-    "snd-Latn",
-    "sin-Latn",
-    "slk-Latn",
-    "slv-Latn",
-    "som-Latn",
-    "sqi-Latn",
-    "srp-Latn",
-    "sun-Latn",
-    "swe-Latn",
-    "swa-Latn",
-    "tam-Latn",
-    "tam-Taml",
-    "tel-Latn",
-    "tel-Telu",
-    "tha-Latn",
-    "tgl-Latn",
-    "tur-Latn",
-    "uig-Latn",
-    "ukr-Latn",
-    "urd-Latn",
-    "urd-Arab",
-    "uzb-Latn",
-    "vie-Latn",
-    "xho-Latn",
-    "yid-Latn",
-    "zho-Hant",
-    "zho-Hans",
-]
+
 
 MULTILINGUAL_E5_CITATION = """
 @article{wang2024multilingual,
mteb/models/model_implementations/facebookai.py (new file)

@@ -0,0 +1,147 @@
+from mteb.models import sentence_transformers_loader
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+
+XLMR_LANGUAGES = [
+    "afr-Latn",
+    "amh-Latn",
+    "ara-Latn",
+    "asm-Latn",
+    "aze-Latn",
+    "bel-Latn",
+    "bul-Latn",
+    "ben-Latn",
+    "ben-Beng",
+    "bre-Latn",
+    "bos-Latn",
+    "cat-Latn",
+    "ces-Latn",
+    "cym-Latn",
+    "dan-Latn",
+    "deu-Latn",
+    "ell-Latn",
+    "eng-Latn",
+    "epo-Latn",
+    "spa-Latn",
+    "est-Latn",
+    "eus-Latn",
+    "fas-Latn",
+    "fin-Latn",
+    "fra-Latn",
+    "fry-Latn",
+    "gle-Latn",
+    "gla-Latn",
+    "glg-Latn",
+    "guj-Latn",
+    "hau-Latn",
+    "heb-Latn",
+    "hin-Latn",
+    "hin-Deva",
+    "hrv-Latn",
+    "hun-Latn",
+    "hye-Latn",
+    "ind-Latn",
+    "isl-Latn",
+    "ita-Latn",
+    "jpn-Latn",
+    "jav-Latn",
+    "kat-Latn",
+    "kaz-Latn",
+    "khm-Latn",
+    "kan-Latn",
+    "kor-Latn",
+    "kur-Latn",
+    "kir-Latn",
+    "lat-Latn",
+    "lao-Latn",
+    "lit-Latn",
+    "lav-Latn",
+    "mlg-Latn",
+    "mkd-Latn",
+    "mal-Latn",
+    "mon-Latn",
+    "mar-Latn",
+    "msa-Latn",
+    "mya-Latn",
+    "nep-Latn",
+    "nld-Latn",
+    "nob-Latn",
+    "orm-Latn",
+    "ori-Latn",
+    "pan-Latn",
+    "pol-Latn",
+    "pus-Latn",
+    "por-Latn",
+    "ron-Latn",
+    "rus-Latn",
+    "san-Latn",
+    "snd-Latn",
+    "sin-Latn",
+    "slk-Latn",
+    "slv-Latn",
+    "som-Latn",
+    "sqi-Latn",
+    "srp-Latn",
+    "sun-Latn",
+    "swe-Latn",
+    "swa-Latn",
+    "tam-Latn",
+    "tam-Taml",
+    "tel-Latn",
+    "tel-Telu",
+    "tha-Latn",
+    "tgl-Latn",
+    "tur-Latn",
+    "uig-Latn",
+    "ukr-Latn",
+    "urd-Latn",
+    "urd-Arab",
+    "uzb-Latn",
+    "vie-Latn",
+    "xho-Latn",
+    "yid-Latn",
+    "zho-Hant",
+    "zho-Hans",
+]
+
+
+xlmr_base = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore[arg-type]
+    name="FacebookAI/xlm-roberta-base",
+    languages=XLMR_LANGUAGES,
+    open_weights=True,
+    revision="e73636d4f797dec63c3081bb6ed5c7b0bb3f2089",
+    release_date="2019-11-05",  # arxiv paper release
+    n_parameters=278043648,
+    memory_usage_mb=1064,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/FacebookAI/xlm-roberta-base",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=set(),
+)
+
+xlmr_large = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore[arg-type]
+    name="FacebookAI/xlm-roberta-large",
+    languages=XLMR_LANGUAGES,
+    open_weights=True,
+    revision="c23d21b0620b635a76227c604d44e43a9f0ee389",
+    release_date="2019-11-05",  # arxiv paper release
+    n_parameters=559890432,
+    memory_usage_mb=2141,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/FacebookAI/xlm-roberta-large",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=set(),
+)
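A brief usage sketch, assuming mteb's public get_model_meta / get_model helpers resolve the names registered above (attribute names mirror the ModelMeta fields in the diff):

import mteb

meta = mteb.get_model_meta("FacebookAI/xlm-roberta-base")
print(meta.embed_dim, meta.max_tokens)  # 768, 512 per the metadata above

# Instantiates the sentence-transformers wrapper; downloads weights on first use.
model = mteb.get_model("FacebookAI/xlm-roberta-large")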
mteb/models/model_implementations/kblab.py (new file)

@@ -0,0 +1,24 @@
+from mteb.models import sentence_transformers_loader
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+
+sbert_swedish = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore[arg-type]
+    name="KBLab/sentence-bert-swedish-cased",
+    languages=["swe-Latn"],
+    open_weights=True,
+    revision="6b5e83cd29c03729cfdc33d13b1423399b0efb5c",
+    release_date="2023-01-11",
+    n_parameters=124690944,
+    memory_usage_mb=476,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=384,
+    reference="https://huggingface.co/KBLab/sentence-bert-swedish-cased",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    adapted_from="sentence-transformers/all-mpnet-base-v2",
+)
mteb/models/model_implementations/kfst.py (new file)

@@ -0,0 +1,24 @@
+from mteb.models import sentence_transformers_loader
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+
+xlmr_scandi = ModelMeta(
+    loader=sentence_transformers_loader,  # type: ignore[arg-type]
+    name="KFST/XLMRoberta-en-da-sv-nb",
+    languages=["swe-Latn", "nob-Latn", "nno-Latn", "dan-Latn", "eng-Latn"],
+    open_weights=True,
+    revision="d40c10ca7b1e68b5a8372f2d112dac9eb3279df1",
+    release_date="2022-02-22",
+    n_parameters=278043648,
+    memory_usage_mb=1061,
+    embed_dim=768,
+    license="not specified",
+    max_tokens=512,
+    reference="https://huggingface.co/KFST/XLMRoberta-en-da-sv-nb",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    adapted_from="FacebookAI/xlm-roberta-base",
+)
mteb/models/model_implementations/nbailab.py (new file)

@@ -0,0 +1,67 @@
+from mteb.models.model_meta import ModelMeta, ScoringFunction
+from mteb.models.sentence_transformer_wrapper import (
+    SentenceTransformerEncoderWrapper,
+)
+
+nb_sbert = ModelMeta(
+    loader=SentenceTransformerEncoderWrapper,  # type: ignore[arg-type]
+    name="NbAiLab/nb-sbert-base",
+    languages=["nno-Latn", "nob-Latn", "swe-Latn", "dan-Latn"],
+    open_weights=True,
+    revision="b95656350a076aeafd2d23763660f80655408cc6",
+    release_date="2022-11-23",
+    n_parameters=1_780_000_000,
+    memory_usage_mb=678,
+    embed_dim=4096,
+    license="apache-2.0",
+    max_tokens=75,
+    reference="https://huggingface.co/NbAiLab/nb-sbert-base",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian",
+    training_datasets=set(),
+)
+
+nb_bert_large = ModelMeta(
+    loader=SentenceTransformerEncoderWrapper,  # type: ignore[arg-type]
+    name="NbAiLab/nb-bert-large",
+    languages=["nno-Latn", "nob-Latn"],
+    open_weights=True,
+    revision="f9d0fc184adab4dc354d85e1854b7634540d7550",
+    release_date="2021-04-29",
+    n_parameters=355087360,
+    memory_usage_mb=1359,
+    embed_dim=1024,
+    license="cc-by-4.0",
+    max_tokens=512,
+    reference="https://huggingface.co/NbAiLab/nb-bert-large",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/NbAiLab/nb-bert-large#training-data",
+    training_datasets=set(),
+)
+
+nb_bert_base = ModelMeta(
+    loader=SentenceTransformerEncoderWrapper,  # type: ignore[arg-type]
+    name="NbAiLab/nb-bert-base",
+    languages=["nno-Latn", "nob-Latn"],
+    open_weights=True,
+    revision="9417c3f62a3adc99f17ff92bff446f35d011f994",
+    release_date="2021-01-13",
+    n_parameters=177853440,
+    memory_usage_mb=681,
+    embed_dim=768,
+    license="cc-by-4.0",
+    max_tokens=512,
+    reference="https://huggingface.co/NbAiLab/nb-bert-base",
+    similarity_fn_name=ScoringFunction.COSINE,
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code=None,
+    public_training_data="https://huggingface.co/NbAiLab/nb-bert-base#training-data",
+    training_datasets=set(),
+)
mteb/tasks/bitext_mining/multilingual/__init__.py

@@ -16,7 +16,7 @@ from .nusa_translation_bitext_mining import NusaTranslationBitextMining
 from .nusa_x_bitext_mining import NusaXBitextMining
 from .phinc_bitext_mining import PhincBitextMining
 from .roma_tales_bitext_mining import RomaTalesBitextMining
-from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining
+from .ru_sci_bench_bitext_mining import RuSciBenchBitextMining, RuSciBenchBitextMiningV2
 from .tatoeba_bitext_mining import TatoebaBitextMining
 from .web_faq_bitext_mining import WebFAQBitextMiningQAs, WebFAQBitextMiningQuestions
 
@@ -40,6 +40,7 @@ __all__ = [
     "PhincBitextMining",
     "RomaTalesBitextMining",
     "RuSciBenchBitextMining",
+    "RuSciBenchBitextMiningV2",
     "TatoebaBitextMining",
     "WebFAQBitextMiningQAs",
     "WebFAQBitextMiningQuestions",
mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py

@@ -42,6 +42,52 @@ class RuSciBenchBitextMining(AbsTaskBitextMining):
   volume = {110},
   year = {2024},
 }
+""",
+        prompt="Given the following title and abstract of the scientific article, find its translation",
+        superseded_by="RuSciBenchBitextMining.v2",
+    )
+
+
+class RuSciBenchBitextMiningV2(AbsTaskBitextMining):
+    fast_loading = True
+    metadata = TaskMetadata(
+        name="RuSciBenchBitextMining.v2",
+        dataset={
+            "path": "mlsa-iai-msu-lab/ru_sci_bench_bitext_mining",
+            "revision": "20e815e8ac8787331546386dfd177821510f79a3",
+        },
+        description="This task focuses on finding translations of scientific articles. The dataset is sourced from eLibrary, Russia's largest electronic library of scientific publications. Russian authors often provide English translations for their abstracts and titles, and the data consists of these paired titles and abstracts. The task evaluates a model's ability to match an article's Russian title and abstract to its English counterpart, or vice versa. Compared to the previous version, 6 erroneous examples have been removed.",
+        reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench_mteb",
+        type="BitextMining",
+        category="t2c",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs={
+            "ru-en": ["rus-Cyrl", "eng-Latn"],
+            "en-ru": ["eng-Latn", "rus-Cyrl"],
+        },
+        main_score="f1",
+        date=("2007-01-01", "2023-01-01"),
+        domains=["Academic", "Non-fiction", "Written"],
+        task_subtypes=[],
+        license="not specified",
+        dialect=[],
+        sample_creation="found",
+        annotations_creators="derived",
+        bibtex_citation=r"""
+@article{vatolin2024ruscibench,
+  author = {Vatolin, A. and Gerasimenko, N. and Ianina, A. and Vorontsov, K.},
+  doi = {10.1134/S1064562424602191},
+  issn = {1531-8362},
+  journal = {Doklady Mathematics},
+  month = {12},
+  number = {1},
+  pages = {S251--S260},
+  title = {RuSciBench: Open Benchmark for Russian and English Scientific Document Representations},
+  url = {https://doi.org/10.1134/S1064562424602191},
+  volume = {110},
+  year = {2024},
+}
+""",
+        prompt="Given the following title and abstract of the scientific article, find its translation",
+    )
 """,
         prompt="Given the following title and abstract of the scientific article, find its translation",
     )
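A small sketch of fetching the new task by name, assuming the standard mteb task registry (get_tasks with the tasks= keyword is the same entry point used in benchmarks.py above); the superseded v1 task remains registered:

import mteb

tasks = mteb.get_tasks(tasks=["RuSciBenchBitextMining.v2"])
print(tasks[0].metadata.main_score)  # "f1"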
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.3.7
+Version: 2.3.9
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/RECORD

@@ -15,6 +15,7 @@ mteb/similarity_functions.py,sha256=ySSnrKl4cSKOWfyIKQPVTJtxuy2ZNfcv0COXDp22QlQ,
 mteb/_evaluators/__init__.py,sha256=Ag1_RWpxBGMpujzd3FZjI40gY_KQKIpY31tJPuk-hFg,1013
 mteb/_evaluators/_download.py,sha256=jntlcURbJxcxUjTmn2D9Tu6ZnWgDc9t5bY8p9CZCqv4,586
 mteb/_evaluators/any_sts_evaluator.py,sha256=f0V3NDP5Bfp8qEeBwP8E-Enj5F5NbFze-kGmzlkObQA,3762
+mteb/_evaluators/classification_metrics.py,sha256=TI-cMPWrIpMqpsNhhwSBY4bZUu2yM469fbcu44zolW0,1926
 mteb/_evaluators/clustering_evaluator.py,sha256=5XoKHl5LcG9jQ9oBzNAWYVpZWWUxrars3t7TdIV7xS0,2052
 mteb/_evaluators/evaluator.py,sha256=gwaeftcAKoGcIQs8jIaafynbcYrYErj6AitHBxgjn2w,807
 mteb/_evaluators/pair_classification_evaluator.py,sha256=6lgDI9wRfEK937YTS4l0W1OL1IQpHYZ4l34-Lxi9KdA,6401
@@ -37,7 +38,7 @@ mteb/abstasks/classification.py,sha256=Es9pmRdjv6xbc-KnGqVdO6dR1cc7yAHhekCZES7n5
 mteb/abstasks/clustering.py,sha256=4KcaU8_sNLmLvMhwDpNmcY2nD3BNyx_LcM-ddSv-wtY,14410
 mteb/abstasks/clustering_legacy.py,sha256=HZY8zgBgqqs5urF_to9wzqm3MnjFivs59hU6P3NrzcI,8684
 mteb/abstasks/dataset_card_template.md,sha256=aD6l8qc3_jxwoIGJNYLzse-jpRa8hu92AxpnUtNgges,5122
-mteb/abstasks/multilabel_classification.py,sha256=
+mteb/abstasks/multilabel_classification.py,sha256=rpIwI3jV2YKtmXlFS2_Ytg4yYjdjPy0q5OU4MsRJFqo,9211
 mteb/abstasks/pair_classification.py,sha256=ToOBFDiokZOz9ea-klMLj_37slbVFR3lSuihP81x9Lc,13263
 mteb/abstasks/regression.py,sha256=SeacOErZUXGLGOcwqAvht6BlbD8fcsn9QhNiFIuJGyc,8832
 mteb/abstasks/retrieval.py,sha256=7QTKYlGaGvF1lOQkB_B4qj8Vm2FxxFXNVTHhfwZO8Bw,26439
@@ -45,6 +46,9 @@ mteb/abstasks/retrieval_dataset_loaders.py,sha256=WukcFAn54rUpXULCG43eysHozXHAxo
 mteb/abstasks/sts.py,sha256=aKTivjvDtAaoYb1hz1NBv2o3UpDR-3AaeHgkDFHMBGI,9077
 mteb/abstasks/task_metadata.py,sha256=7CzYK1y-vwLUiWaEGPgU3HiolpW3UCul8Y2KJ-WSpeE,26892
 mteb/abstasks/zeroshot_classification.py,sha256=4UxBIZ1e1iRK8PRAhCWnnSDirK2vi5-w2N5ZomCnaIM,5882
+mteb/abstasks/_data_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mteb/abstasks/_data_filter/filters.py,sha256=p1QLy7V9jYVFicef61fwzCpbSpTA6rOv8CxkwEUTMvc,4585
+mteb/abstasks/_data_filter/task_pipelines.py,sha256=L56nKTGwLH3QqmzkO-Wx4Vi5vfovnnKIDN_f3M8FSiA,3078
 mteb/abstasks/image/__init__.py,sha256=NgvMJnp1g2mUv27RL-TvzA7s1BOdMG-EB1CrZfdbWdg,136
 mteb/abstasks/image/image_text_pair_classification.py,sha256=SejETTXc3g2VSWYafTe-VAHZcNpX98bgzsWsqQisIzI,7712
 mteb/abstasks/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,7 +60,7 @@ mteb/benchmarks/_create_table.py,sha256=OAiR44ynJ2fMzoBmVITQtOTYQzxIu9KUdS_HzlBl
 mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
 mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
 mteb/benchmarks/benchmarks/__init__.py,sha256=Ig5dSFunzI-F-OamruuKJVSstbG3xQNkXCxRY3Bj_Ck,2180
-mteb/benchmarks/benchmarks/benchmarks.py,sha256=
+mteb/benchmarks/benchmarks/benchmarks.py,sha256=vWX6QZgqF9iKAE1tIQwaXw9f8q_WiBtdgo8yj4_CHFI,94767
 mteb/benchmarks/benchmarks/rteb_benchmarks.py,sha256=QnCSrTTaBfcRlAQp2Nu81tgv1idMXqiM16Fp2zKJ5Ys,10607
 mteb/cli/__init__.py,sha256=v-csUr3eUZElIvrGB6QGtaIdndDfNWEe9oZchsGsJpg,64
 mteb/cli/_display_tasks.py,sha256=7A06dT9sSoTz6shyMvskPxuc5eHY_H7PGPlROzMP0yw,2196
@@ -83,6 +87,7 @@ mteb/descriptive_stats/BitextMining/PhincBitextMining.json,sha256=bC31IS_N3-eehB
 mteb/descriptive_stats/BitextMining/PubChemSMILESBitextMining.json,sha256=v4OzVwYV3Q-J3VitfK8zX_t2ZOZzvD4WtPeJoFeVNgI,3737
 mteb/descriptive_stats/BitextMining/RomaTalesBitextMining.json,sha256=ciHSHuKgi4Ip0eH4f6G52w4MQ0BFvjizBN1Mh-2tPNE,1415
 mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.json,sha256=f_lS0ua_dtwhmw-zqqZGGPJ4b_4u82VDicM8a71SId8,2209
+mteb/descriptive_stats/BitextMining/RuSciBenchBitextMining.v2.json,sha256=jm6tsOyOooX64kgPl_rLTv_aiHy34Nc1MudlqXgrSpQ,2214
 mteb/descriptive_stats/BitextMining/SAMSumFa.json,sha256=A1o7RjIwoNwjxRoMS6Qmn-4VGy5vX_QDK4sQAjJCM5Y,613
 mteb/descriptive_stats/BitextMining/SRNCorpusBitextMining.json,sha256=41u--q1IXxBXqFb8f_BXYdv9fIlUkbfSPldzwCNbo-w,2136
 mteb/descriptive_stats/BitextMining/SynPerChatbotRAGSumSRetrieval.json,sha256=nNEK8VHlRxDRmMjwuBA4U5RuXNZwxWyCHFCJHQbqIAQ,612
@@ -1426,7 +1431,7 @@ mteb/languages/language_scripts.py,sha256=5wix9HTYolNIpTiS5oXf2pGJyL7ftdGKs_m432
 mteb/languages/programming_languages.py,sha256=zxAakT3OSUnAuTnQ34VyeFIECnNXMlleZmAake6jsZE,211
 mteb/leaderboard/__init__.py,sha256=991roXmtRwEQysV-37hWEzWpkvPgMCGRqZTHR-hm2io,88
 mteb/leaderboard/app.py,sha256=29MxFLKEVT-roULHG5boHmsQVhld1rDGNS94r7MWlz8,33118
-mteb/leaderboard/benchmark_selector.py,sha256=
+mteb/leaderboard/benchmark_selector.py,sha256=qd-2L20RQ4ACke01UlytkhZok1dkWgfUlXzfET52kGc,7956
 mteb/leaderboard/figures.py,sha256=mPO0go_23QEhAm1RJdLiBxPFCoUiA74_ztyl6yimc7k,7553
 mteb/leaderboard/table.py,sha256=6SnrYC5GcBlvVSO6vOk6ObuqtoveBLv3JUuXqdKueG8,8333
 mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
@@ -1472,7 +1477,7 @@ mteb/models/model_implementations/colsmol_models.py,sha256=O2M7Ksydh94M_Iax4KytH
 mteb/models/model_implementations/conan_models.py,sha256=G-s7xo9VtNX-f7lWKtYVGHHiMMN0Xp44PlNIp7E0LAo,6502
 mteb/models/model_implementations/dino_models.py,sha256=QFgaFHR5YKrylqJGSljXCBn2W7qHhmF6KdXkvHrQNEI,16380
 mteb/models/model_implementations/e5_instruct.py,sha256=9R4GoSFicgqNDCh3HhTN_8L1qhzuEKvatjHYn3T9zlU,7676
-mteb/models/model_implementations/e5_models.py,sha256=
+mteb/models/model_implementations/e5_models.py,sha256=ZLRgzx2uEBc_yWY6DwcJFUNKG6RHpWSEVp1_jaEURhs,9373
 mteb/models/model_implementations/e5_v.py,sha256=_9W7I0ryIzx_H9eCkzwdm8iHdGX1LIjKGXkhSh_zNv8,6690
 mteb/models/model_implementations/eagerworks_models.py,sha256=NOQkCUqn9jLSpf9p6KyaIHnJxYV1MNlr2z7hO2AcRSc,5744
 mteb/models/model_implementations/emillykkejensen_models.py,sha256=QdhGqCm_1-AURkrniZj2S1MjwwIVOPMzLvpgfJq-3EQ,2779
@@ -1480,6 +1485,7 @@ mteb/models/model_implementations/en_code_retriever.py,sha256=leZ-0M6LrunocY3XQB
 mteb/models/model_implementations/euler_models.py,sha256=fZoXYeDjSRN2Qj1Pf-ROi8xok03PjhYi4FLEZKjMPkk,905
 mteb/models/model_implementations/evaclip_models.py,sha256=cPMGYLDIq4s8zJxb4vPXqJ-rqwPaq7KOh2QZSO6cDas,8000
 mteb/models/model_implementations/fa_models.py,sha256=WGal70_ezITWoNdjcMdbOCTSCtoaXzuPadYstLVXxhg,7478
+mteb/models/model_implementations/facebookai.py,sha256=uhE6rB1YgxE0SIc7u8heE1U62qRFFA23IMgpjxBq_Ok,3116
 mteb/models/model_implementations/geogpt_models.py,sha256=Juv86SwhgQX80lVLjAFtim2aSiJT1AcgjniyyiKyk1Q,1923
 mteb/models/model_implementations/gme_v_models.py,sha256=NkfgR3_UdZzoBt1NnalVou6LOR-F7qXM4by9EbAVrys,13568
 mteb/models/model_implementations/google_models.py,sha256=7QfsaJ5JNDRQxFl7Zh2AtiR2PR7PZcfeCBgviuOFBCo,9130
@@ -1494,7 +1500,9 @@ mteb/models/model_implementations/jasper_models.py,sha256=ZY7qRRpBpD3eVryQb4rLs5
 mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
 mteb/models/model_implementations/jina_models.py,sha256=HrHm2Io3g9gHwxU5icAaudy_E8rAVkAAIFSzVYWF-dM,34859
 mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
+mteb/models/model_implementations/kblab.py,sha256=DDh8gDEI6YPjS4_yGYWC4HatE0mFf7vhGDU83zzV7V0,866
 mteb/models/model_implementations/kennethenevoldsen_models.py,sha256=DF-9nmsewYO9ikZ0kV81ujKGr7Ot36-9iPoxN7KX2mY,2993
+mteb/models/model_implementations/kfst.py,sha256=BQj0fxMJwyA6NOdK26NDYVL3z2PW1_F-lTTVImxEWZQ,892
 mteb/models/model_implementations/kowshik24_models.py,sha256=HoQpybjhquK2XSnawlq0aiSWFI5M7l6N4DNY4MQ-P10,976
 mteb/models/model_implementations/lens_models.py,sha256=fC7_NB1F8vBAlXD0p0-hALf6eZTPFJwpz57dy71OlwI,1696
 mteb/models/model_implementations/lgai_embedding_models.py,sha256=S83pbfkMH3YUNl4skusgbK-Rn-uLuScQVxgXwegR_N4,2333
@@ -1510,7 +1518,7 @@ mteb/models/model_implementations/moco_models.py,sha256=Kl0nBsqkG3crYoo5YulFq1fv
 mteb/models/model_implementations/model2vec_models.py,sha256=D-EY-6P-cKKunbgzk4DHzJL1ogpWYFhpHbTLb8qQjJw,13765
 mteb/models/model_implementations/moka_models.py,sha256=Y5do7Z4JyGxabYrjHhkBLqCKTQKotniS-f4kOgXJjag,4995
 mteb/models/model_implementations/mxbai_models.py,sha256=33ta2BnhvKYBUgE89wFgPNf-CnOb7ooumZvqHOvbZsA,3593
-mteb/models/model_implementations/
+mteb/models/model_implementations/nbailab.py,sha256=bqqR0qs10IH2g5HC6K962tDMBciw8qFsNVHADNS72jk,2396
 mteb/models/model_implementations/no_instruct_sentence_models.py,sha256=6i-xbLRRNKuDpU-hwklwdQjgu1wnz5CecLSoc6kyd7Q,3976
 mteb/models/model_implementations/nomic_models.py,sha256=mT-v5Gs5-sRH8-ziCw_CtxB9ox3C6FtwWJjNghNrunw,11334
 mteb/models/model_implementations/nomic_models_vision.py,sha256=gEEieMThvw4p-QhRH0G_9-WWTvj-jqOlgFsh6O07dbc,6731
@@ -1596,7 +1604,7 @@ mteb/tasks/bitext_mining/fas/__init__.py,sha256=srw2I-yfcLgIkRzJv-p_qZ9fg0cCKr-D
 mteb/tasks/bitext_mining/fas/fa_mteb_summary_retrieval.py,sha256=yj8l1K3CIdESwl4sl2M4HhyOrZZYQZb_EDo-EgCAJdg,3330
 mteb/tasks/bitext_mining/kat/__init__.py,sha256=a-KcFJ3Ol7R8yq02RcGjaOxEfqnwJeo7AAib-RU-JFw,116
 mteb/tasks/bitext_mining/kat/tbilisi_city_hall_bitext_mining.py,sha256=xVCxpJr7UW2KadNdn7Gsw-wZ65uz5vhRDhQZ7eILokQ,1918
-mteb/tasks/bitext_mining/multilingual/__init__.py,sha256=
+mteb/tasks/bitext_mining/multilingual/__init__.py,sha256=LcPygeOuvrka67aDkktT-2lSqcxpWPSMmd_BaxIsl24,2012
 mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py,sha256=lNbCz3dN9o3F04Y7vtNBhF-lPUNyVbAOKgUR-QKZn_8,29082
 mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py,sha256=tTKvS-v7d9V_zymCn_ZonUKlo9NI7vTyppxS9iAu8I0,2873
 mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py,sha256=P_UHMWh2gKG6CloXmP5J2kjrCTQwoJAU1MKdLl6JFKc,1836
@@ -1615,7 +1623,7 @@ mteb/tasks/bitext_mining/multilingual/nusa_translation_bitext_mining.py,sha256=e
 mteb/tasks/bitext_mining/multilingual/nusa_x_bitext_mining.py,sha256=BphnEDdG1-IsCklJWRCs2yK7I1zVuPh7PQrrYAI540c,2309
 mteb/tasks/bitext_mining/multilingual/phinc_bitext_mining.py,sha256=53xLXpgIDk55JfCoe3pa93T_9T9sfRJAryBVcWZx5co,1477
 mteb/tasks/bitext_mining/multilingual/roma_tales_bitext_mining.py,sha256=5uwf4vhud5bQuPAcufWGcA7UBmp5YPKsyvc5OUYgK-M,1730
-mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py,sha256=
+mteb/tasks/bitext_mining/multilingual/ru_sci_bench_bitext_mining.py,sha256=09e8XDy-zvqpvGMuj8AIbYUGmrggPi6XvOZi9Fbw0G4,4162
 mteb/tasks/bitext_mining/multilingual/tatoeba_bitext_mining.py,sha256=Y6QnjbmL4fIuTgK8tuZfNnWmM-GwBVaAjUiAtTSgPqM,5898
 mteb/tasks/bitext_mining/multilingual/web_faq_bitext_mining.py,sha256=vWoRmAJKR0ed8mUKDqLUMe_IpFjj7Xlsw0NFhcIOJOc,14978
 mteb/tasks/bitext_mining/srn/__init__.py,sha256=XaMVvUagmgLUG6tZw2jo6fMKiVTfQpaaWZGQZo-1YYk,97
@@ -2573,9 +2581,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
-mteb-2.3.
+mteb-2.3.9.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.3.9.dist-info/METADATA,sha256=da_FgK7mGK2HivEwQfKDyIPYzDVMFaz-lTeVQVvp2q8,13923
+mteb-2.3.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mteb-2.3.9.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.3.9.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.3.9.dist-info/RECORD,,
mteb/models/model_implementations/nb_sbert.py (deleted)

@@ -1,25 +0,0 @@
-from mteb.models.model_meta import ModelMeta
-from mteb.models.sentence_transformer_wrapper import (
-    SentenceTransformerEncoderWrapper,
-)
-
-nb_sbert = ModelMeta(
-    loader=SentenceTransformerEncoderWrapper,
-    name="NbAiLab/nb-sbert-base",
-    languages=["nno-Latn", "nob-Latn", "swe-Latn", "dan-Latn"],
-    open_weights=True,
-    revision="b95656350a076aeafd2d23763660f80655408cc6",
-    release_date="2022-11-23",
-    n_parameters=1_780_000_000,
-    memory_usage_mb=678,
-    embed_dim=4096,
-    license="apache-2.0",
-    max_tokens=75,
-    reference="https://huggingface.co/NbAiLab/nb-sbert-base",
-    similarity_fn_name="cosine",
-    framework=["Sentence Transformers", "PyTorch"],
-    use_instructions=False,
-    public_training_code=None,
-    public_training_data="https://huggingface.co/datasets/NbAiLab/mnli-norwegian",
-    training_datasets=set(),
-)
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/WHEEL: File without changes
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/entry_points.txt: File without changes
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/licenses/LICENSE: File without changes
{mteb-2.3.7.dist-info → mteb-2.3.9.dist-info}/top_level.txt: File without changes