mteb 2.3.2__py3-none-any.whl → 2.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/benchmarks/_create_table.py +23 -34
- mteb/leaderboard/app.py +0 -3
- mteb/leaderboard/figures.py +13 -15
- mteb/leaderboard/table.py +17 -2
- mteb/models/model_implementations/clips_models.py +97 -0
- mteb/models/model_implementations/cohere_models.py +3 -0
- mteb/models/model_implementations/google_models.py +0 -4
- mteb/models/model_implementations/kennethenevoldsen_models.py +72 -0
- mteb/models/model_implementations/linq_models.py +2 -2
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/METADATA +3 -3
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/RECORD +15 -13
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/WHEEL +0 -0
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.2.dist-info → mteb-2.3.3.dist-info}/top_level.txt +0 -0
mteb/benchmarks/_create_table.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import math
|
|
2
1
|
import re
|
|
3
2
|
from collections import defaultdict
|
|
4
3
|
|
|
@@ -32,26 +31,18 @@ def _split_on_capital(s: str) -> str:
|
|
|
32
31
|
return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))
|
|
33
32
|
|
|
34
33
|
|
|
35
|
-
def _format_n_parameters(n_parameters) ->
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
n_zeros = math.log10(n_thousand)
|
|
42
|
-
if n_zeros >= 6:
|
|
43
|
-
return str(n_thousand // (10**6)) + "B"
|
|
44
|
-
if n_zeros >= 3:
|
|
45
|
-
return str(n_thousand // (10**3)) + "M"
|
|
46
|
-
return str(n_thousand) + "K"
|
|
34
|
+
def _format_n_parameters(n_parameters) -> float | None:
|
|
35
|
+
"""Format n_parameters to be in billions with decimals down to 1 million. I.e. 7M -> 0.007B, 1.5B -> 1.5B, None -> None"""
|
|
36
|
+
if n_parameters:
|
|
37
|
+
n_parameters = float(n_parameters)
|
|
38
|
+
return round(n_parameters / 1e9, 3)
|
|
39
|
+
return None
|
|
47
40
|
|
|
48
41
|
|
|
49
|
-
def _format_max_tokens(max_tokens: float | None) ->
|
|
50
|
-
if max_tokens is None:
|
|
51
|
-
return
|
|
52
|
-
|
|
53
|
-
return "Infinite"
|
|
54
|
-
return str(int(max_tokens))
|
|
42
|
+
def _format_max_tokens(max_tokens: float | None) -> float | None:
|
|
43
|
+
if max_tokens is None or max_tokens == np.inf:
|
|
44
|
+
return None
|
|
45
|
+
return float(max_tokens)
|
|
55
46
|
|
|
56
47
|
|
|
57
48
|
def _get_means_per_types(per_task: pd.DataFrame):
|
|
@@ -144,18 +135,18 @@ def _create_summary_table_from_benchmark_results(
|
|
|
144
135
|
joint_table.insert(
|
|
145
136
|
1,
|
|
146
137
|
"Embedding Dimensions",
|
|
147
|
-
model_metas.map(lambda m:
|
|
138
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
148
139
|
)
|
|
149
140
|
joint_table.insert(
|
|
150
141
|
1,
|
|
151
|
-
"Number of Parameters",
|
|
142
|
+
"Number of Parameters (B)",
|
|
152
143
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
153
144
|
)
|
|
154
145
|
joint_table.insert(
|
|
155
146
|
1,
|
|
156
147
|
"Memory Usage (MB)",
|
|
157
148
|
model_metas.map(
|
|
158
|
-
lambda m:
|
|
149
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
159
150
|
),
|
|
160
151
|
)
|
|
161
152
|
|
|
@@ -323,18 +314,18 @@ def _create_summary_table_mean_public_private(
|
|
|
323
314
|
joint_table.insert(
|
|
324
315
|
1,
|
|
325
316
|
"Embedding Dimensions",
|
|
326
|
-
model_metas.map(lambda m:
|
|
317
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
327
318
|
)
|
|
328
319
|
joint_table.insert(
|
|
329
320
|
1,
|
|
330
|
-
"Number of Parameters",
|
|
321
|
+
"Number of Parameters (B)",
|
|
331
322
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
332
323
|
)
|
|
333
324
|
joint_table.insert(
|
|
334
325
|
1,
|
|
335
326
|
"Memory Usage (MB)",
|
|
336
327
|
model_metas.map(
|
|
337
|
-
lambda m:
|
|
328
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
338
329
|
),
|
|
339
330
|
)
|
|
340
331
|
|
|
@@ -445,18 +436,18 @@ def _create_summary_table_mean_subset(
|
|
|
445
436
|
joint_table.insert(
|
|
446
437
|
1,
|
|
447
438
|
"Embedding Dimensions",
|
|
448
|
-
model_metas.map(lambda m:
|
|
439
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
449
440
|
)
|
|
450
441
|
joint_table.insert(
|
|
451
442
|
1,
|
|
452
|
-
"Number of Parameters",
|
|
443
|
+
"Number of Parameters (B)",
|
|
453
444
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
454
445
|
)
|
|
455
446
|
joint_table.insert(
|
|
456
447
|
1,
|
|
457
448
|
"Memory Usage (MB)",
|
|
458
449
|
model_metas.map(
|
|
459
|
-
lambda m:
|
|
450
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
460
451
|
),
|
|
461
452
|
)
|
|
462
453
|
|
|
@@ -558,25 +549,23 @@ def _create_summary_table_mean_task_type(
|
|
|
558
549
|
|
|
559
550
|
# Insert model metadata columns
|
|
560
551
|
joint_table.insert(
|
|
561
|
-
1,
|
|
562
|
-
"Max Tokens",
|
|
563
|
-
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
|
|
552
|
+
1, "Max Tokens", model_metas.map(lambda m: _format_max_tokens(m.max_tokens))
|
|
564
553
|
)
|
|
565
554
|
joint_table.insert(
|
|
566
555
|
1,
|
|
567
556
|
"Embedding Dimensions",
|
|
568
|
-
model_metas.map(lambda m:
|
|
557
|
+
model_metas.map(lambda m: int(m.embed_dim) if m.embed_dim else None),
|
|
569
558
|
)
|
|
570
559
|
joint_table.insert(
|
|
571
560
|
1,
|
|
572
|
-
"Number of Parameters",
|
|
561
|
+
"Number of Parameters (B)",
|
|
573
562
|
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
|
|
574
563
|
)
|
|
575
564
|
joint_table.insert(
|
|
576
565
|
1,
|
|
577
566
|
"Memory Usage (MB)",
|
|
578
567
|
model_metas.map(
|
|
579
|
-
lambda m:
|
|
568
|
+
lambda m: int(m.memory_usage_mb) if m.memory_usage_mb else None
|
|
580
569
|
),
|
|
581
570
|
)
|
|
582
571
|
|
mteb/leaderboard/app.py
CHANGED
|
@@ -535,9 +535,6 @@ def get_leaderboard_app(cache: ResultCache = ResultCache()) -> gr.Blocks:
|
|
|
535
535
|
|
|
536
536
|
with gr.Tab("Performance per Model Size") as plot_tab:
|
|
537
537
|
plot = gr.Plot(_performance_size_plot, inputs=[summary_table])
|
|
538
|
-
gr.Markdown(
|
|
539
|
-
"*We only display TOP 5 models that have been run on all tasks in the benchmark*"
|
|
540
|
-
)
|
|
541
538
|
plot_tab.select(
|
|
542
539
|
_performance_size_plot, inputs=[summary_table], outputs=[plot]
|
|
543
540
|
)
|
mteb/leaderboard/figures.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from typing import get_args
|
|
2
3
|
|
|
3
4
|
import numpy as np
|
|
@@ -7,6 +8,8 @@ import plotly.graph_objects as go
|
|
|
7
8
|
|
|
8
9
|
from mteb.abstasks.task_metadata import TaskType
|
|
9
10
|
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
10
13
|
|
|
11
14
|
def _text_plot(text: str):
|
|
12
15
|
"""Returns empty scatter plot with text added, this can be great for error messages."""
|
|
@@ -29,16 +32,17 @@ def _failsafe_plot(fun):
|
|
|
29
32
|
try:
|
|
30
33
|
return fun(*args, **kwargs)
|
|
31
34
|
except Exception as e:
|
|
35
|
+
logger.error(f"Plot generation failed: {e}")
|
|
32
36
|
return _text_plot(f"Couldn't produce plot. Reason: {e}")
|
|
33
37
|
|
|
34
38
|
return wrapper
|
|
35
39
|
|
|
36
40
|
|
|
37
|
-
def _parse_n_params(
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
def _parse_n_params(params: float | None) -> int | float:
|
|
42
|
+
"""Specified in billions."""
|
|
43
|
+
if params is None or np.isnan(params):
|
|
44
|
+
return None
|
|
45
|
+
return int(params * 1e9)
|
|
42
46
|
|
|
43
47
|
|
|
44
48
|
def _parse_model_name(name: str) -> str:
|
|
@@ -51,20 +55,14 @@ def _parse_model_name(name: str) -> str:
|
|
|
51
55
|
|
|
52
56
|
|
|
53
57
|
def _parse_float(value) -> float:
|
|
54
|
-
|
|
55
|
-
if value == "Infinite":
|
|
56
|
-
return np.inf
|
|
57
|
-
else:
|
|
58
|
-
return float(value)
|
|
59
|
-
except ValueError:
|
|
58
|
+
if value is None or np.isnan(value):
|
|
60
59
|
return np.nan
|
|
60
|
+
return float(value)
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
def _process_max_tokens(x):
|
|
64
|
-
if pd.isna(x):
|
|
64
|
+
if pd.isna(x) or x is None or np.isinf(x):
|
|
65
65
|
return "Unknown"
|
|
66
|
-
if np.isinf(x):
|
|
67
|
-
return "Infinite"
|
|
68
66
|
return str(int(x))
|
|
69
67
|
|
|
70
68
|
|
|
@@ -112,7 +110,7 @@ def _add_size_guide(fig: go.Figure):
|
|
|
112
110
|
@_failsafe_plot
|
|
113
111
|
def _performance_size_plot(df: pd.DataFrame) -> go.Figure:
|
|
114
112
|
df = df.copy()
|
|
115
|
-
df["Number of Parameters"] = df["Number of Parameters"].map(_parse_n_params)
|
|
113
|
+
df["Number of Parameters"] = df["Number of Parameters (B)"].map(_parse_n_params)
|
|
116
114
|
df["Model"] = df["Model"].map(_parse_model_name)
|
|
117
115
|
df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
|
|
118
116
|
df["Embedding Dimensions"] = df["Embedding Dimensions"].map(_parse_float)
|
mteb/leaderboard/table.py
CHANGED
|
@@ -120,6 +120,14 @@ def apply_per_task_styling_from_benchmark(
|
|
|
120
120
|
return _apply_per_task_table_styling(per_task_df)
|
|
121
121
|
|
|
122
122
|
|
|
123
|
+
def _style_number_of_parameters(num_params: float) -> str:
|
|
124
|
+
"""Anything bigger than 1B is shown in billions with 1 decimal (e.g. 1.712 > 1.7) while anything smaller as 0.xxx B (e.g. 0.345 remains 0.345)"""
|
|
125
|
+
if num_params >= 1:
|
|
126
|
+
return f"{num_params:.1f}"
|
|
127
|
+
else:
|
|
128
|
+
return f"{num_params:.3f}"
|
|
129
|
+
|
|
130
|
+
|
|
123
131
|
def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
|
|
124
132
|
"""Apply styling to a raw summary DataFrame
|
|
125
133
|
|
|
@@ -130,7 +138,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
|
|
|
130
138
|
"Rank (Borda)",
|
|
131
139
|
"Rank",
|
|
132
140
|
"Model",
|
|
133
|
-
"Number of Parameters",
|
|
141
|
+
"Number of Parameters (B)",
|
|
134
142
|
"Embedding Dimensions",
|
|
135
143
|
"Max Tokens",
|
|
136
144
|
"Memory Usage (MB)",
|
|
@@ -156,7 +164,14 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
|
|
|
156
164
|
joint_table[score_columns] = joint_table[score_columns].map(_format_scores)
|
|
157
165
|
|
|
158
166
|
joint_table_style = joint_table.style.format(
|
|
159
|
-
{
|
|
167
|
+
{
|
|
168
|
+
**dict.fromkeys(score_columns, "{:.2f}"),
|
|
169
|
+
"Rank (Borda)": "{:.0f}",
|
|
170
|
+
"Memory Usage (MB)": "{:.0f}",
|
|
171
|
+
"Embedding Dimensions": "{:.0f}",
|
|
172
|
+
"Max Tokens": "{:.0f}",
|
|
173
|
+
"Number of Parameters (B)": lambda x: _style_number_of_parameters(x),
|
|
174
|
+
},
|
|
160
175
|
na_rep="",
|
|
161
176
|
)
|
|
162
177
|
joint_table_style = joint_table_style.highlight_min(
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from mteb.models.model_meta import (
|
|
2
|
+
ModelMeta,
|
|
3
|
+
ScoringFunction,
|
|
4
|
+
)
|
|
5
|
+
from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
|
|
6
|
+
|
|
7
|
+
from .e5_models import ME5_TRAINING_DATA, model_prompts
|
|
8
|
+
|
|
9
|
+
E5_NL_CITATION = """
|
|
10
|
+
@misc{banar2025mtebnle5nlembeddingbenchmark,
|
|
11
|
+
archiveprefix = {arXiv},
|
|
12
|
+
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
|
|
13
|
+
eprint = {2509.12340},
|
|
14
|
+
primaryclass = {cs.CL},
|
|
15
|
+
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
|
|
16
|
+
url = {https://arxiv.org/abs/2509.12340},
|
|
17
|
+
year = {2025},
|
|
18
|
+
}
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
e5_nl_small = ModelMeta(
|
|
22
|
+
loader=sentence_transformers_loader,
|
|
23
|
+
loader_kwargs=dict(
|
|
24
|
+
model_prompts=model_prompts,
|
|
25
|
+
),
|
|
26
|
+
name="clips/e5-small-trm-nl",
|
|
27
|
+
languages=["nld-Latn"],
|
|
28
|
+
open_weights=True,
|
|
29
|
+
revision="0243664a6c5e12eef854b091eb283e51833c3e9f",
|
|
30
|
+
release_date="2025-09-23",
|
|
31
|
+
n_parameters=40_800_000,
|
|
32
|
+
memory_usage_mb=78,
|
|
33
|
+
embed_dim=384,
|
|
34
|
+
license="mit",
|
|
35
|
+
max_tokens=512,
|
|
36
|
+
reference="https://huggingface.co/clips/e5-small-trm-nl",
|
|
37
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
38
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
39
|
+
use_instructions=True,
|
|
40
|
+
public_training_code="https://github.com/ELotfi/e5-nl",
|
|
41
|
+
public_training_data="https://huggingface.co/collections/clips/beir-nl",
|
|
42
|
+
training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
|
|
43
|
+
adapted_from="intfloat/multilingual-e5-small",
|
|
44
|
+
citation=E5_NL_CITATION,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
e5_nl_base = ModelMeta(
|
|
48
|
+
loader=sentence_transformers_loader,
|
|
49
|
+
loader_kwargs=dict(
|
|
50
|
+
model_prompts=model_prompts,
|
|
51
|
+
),
|
|
52
|
+
name="clips/e5-base-trm-nl",
|
|
53
|
+
languages=["nld-Latn"],
|
|
54
|
+
open_weights=True,
|
|
55
|
+
revision="6bd5722f236da48b4b8bcb28cc1fc478f7089956",
|
|
56
|
+
release_date="2025-09-23",
|
|
57
|
+
n_parameters=124_400_000,
|
|
58
|
+
memory_usage_mb=237,
|
|
59
|
+
embed_dim=768,
|
|
60
|
+
license="mit",
|
|
61
|
+
max_tokens=514,
|
|
62
|
+
reference="https://huggingface.co/clips/e5-base-trm-nl",
|
|
63
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
64
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
65
|
+
use_instructions=True,
|
|
66
|
+
public_training_code="https://github.com/ELotfi/e5-nl",
|
|
67
|
+
public_training_data="https://huggingface.co/collections/clips/beir-nl",
|
|
68
|
+
adapted_from="intfloat/multilingual-e5-base",
|
|
69
|
+
training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
|
|
70
|
+
citation=E5_NL_CITATION,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
e5_nl_large = ModelMeta(
|
|
74
|
+
loader=sentence_transformers_loader,
|
|
75
|
+
loader_kwargs=dict(
|
|
76
|
+
model_prompts=model_prompts,
|
|
77
|
+
),
|
|
78
|
+
name="clips/e5-large-trm-nl",
|
|
79
|
+
languages=["nld-Latn"],
|
|
80
|
+
open_weights=True,
|
|
81
|
+
revision="683333f86ed9eb3699b5567f0fdabeb958d412b0",
|
|
82
|
+
release_date="2025-09-23",
|
|
83
|
+
n_parameters=355_000_000,
|
|
84
|
+
memory_usage_mb=1355,
|
|
85
|
+
embed_dim=1024,
|
|
86
|
+
license="mit",
|
|
87
|
+
max_tokens=514,
|
|
88
|
+
reference="https://huggingface.co/clips/e5-large-trm-nl",
|
|
89
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
90
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
91
|
+
use_instructions=True,
|
|
92
|
+
public_training_code="https://github.com/ELotfi/e5-nl",
|
|
93
|
+
public_training_data="https://huggingface.co/collections/clips/beir-nl",
|
|
94
|
+
training_datasets=ME5_TRAINING_DATA, # mMARCO-NL, HotpotQA-NL, FEVER-NL, and LLM generated data
|
|
95
|
+
adapted_from="intfloat/multilingual-e5-large",
|
|
96
|
+
citation=E5_NL_CITATION,
|
|
97
|
+
)
|
|
@@ -8,6 +8,7 @@ import torch
|
|
|
8
8
|
from torch.utils.data import DataLoader
|
|
9
9
|
from tqdm.auto import tqdm
|
|
10
10
|
|
|
11
|
+
from mteb._requires_package import requires_package
|
|
11
12
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
13
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
@@ -219,6 +220,8 @@ class CohereTextEmbeddingModel(AbsEncoder):
|
|
|
219
220
|
output_dimension: int | None = None,
|
|
220
221
|
**kwargs,
|
|
221
222
|
) -> None:
|
|
223
|
+
requires_package(self, "cohere", model_name, "pip install 'mteb[cohere]'")
|
|
224
|
+
|
|
222
225
|
import cohere # type: ignore
|
|
223
226
|
|
|
224
227
|
self.model_name = model_name.removeprefix("Cohere/Cohere-")
|
|
@@ -147,7 +147,6 @@ class GoogleTextEmbeddingModel(AbsEncoder):
|
|
|
147
147
|
google_text_emb_004 = ModelMeta(
|
|
148
148
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
149
149
|
loader_kwargs=dict(
|
|
150
|
-
model_name="text-embedding-004",
|
|
151
150
|
model_prompts=MODEL_PROMPTS,
|
|
152
151
|
),
|
|
153
152
|
name="google/text-embedding-004",
|
|
@@ -172,7 +171,6 @@ google_text_emb_004 = ModelMeta(
|
|
|
172
171
|
google_text_emb_005 = ModelMeta(
|
|
173
172
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
174
173
|
loader_kwargs=dict(
|
|
175
|
-
model_name="text-embedding-005",
|
|
176
174
|
model_prompts=MODEL_PROMPTS,
|
|
177
175
|
),
|
|
178
176
|
name="google/text-embedding-005",
|
|
@@ -197,7 +195,6 @@ google_text_emb_005 = ModelMeta(
|
|
|
197
195
|
google_text_multilingual_emb_002 = ModelMeta(
|
|
198
196
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
199
197
|
loader_kwargs=dict(
|
|
200
|
-
model_name="text-embedding-002",
|
|
201
198
|
model_prompts=MODEL_PROMPTS,
|
|
202
199
|
),
|
|
203
200
|
name="google/text-multilingual-embedding-002",
|
|
@@ -222,7 +219,6 @@ google_text_multilingual_emb_002 = ModelMeta(
|
|
|
222
219
|
google_gemini_embedding_001 = ModelMeta(
|
|
223
220
|
loader=GoogleTextEmbeddingModel, # type: ignore[call-arg]
|
|
224
221
|
loader_kwargs=dict(
|
|
225
|
-
model_name="gemini-embedding-001",
|
|
226
222
|
model_prompts=MODEL_PROMPTS,
|
|
227
223
|
),
|
|
228
224
|
name="google/gemini-embedding-001",
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
2
|
+
from mteb.models.sentence_transformer_wrapper import (
|
|
3
|
+
sentence_transformers_loader,
|
|
4
|
+
)
|
|
5
|
+
|
|
6
|
+
dfm_enc_large = ModelMeta(
|
|
7
|
+
loader=sentence_transformers_loader, # type: ignore
|
|
8
|
+
name="KennethEnevoldsen/dfm-sentence-encoder-large",
|
|
9
|
+
languages=["dan-Latn"],
|
|
10
|
+
open_weights=True,
|
|
11
|
+
revision="132c53391e7a780dc6a2f9a03724d0158fe7122c",
|
|
12
|
+
release_date="2023-07-12",
|
|
13
|
+
n_parameters=355087360,
|
|
14
|
+
memory_usage_mb=1554,
|
|
15
|
+
embed_dim=1024,
|
|
16
|
+
license="mit",
|
|
17
|
+
max_tokens=512,
|
|
18
|
+
reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-large",
|
|
19
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
20
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
21
|
+
use_instructions=False,
|
|
22
|
+
superseded_by=None,
|
|
23
|
+
adapted_from="chcaa/dfm-encoder-large-v1",
|
|
24
|
+
training_datasets=set(), # just contrastive pre-training
|
|
25
|
+
public_training_code="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-large#hyperparameters",
|
|
26
|
+
citation="""@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
|
|
27
|
+
title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
|
|
28
|
+
shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
|
|
29
|
+
url = {https://openreview.net/forum?id=pJl_i7HIA72},
|
|
30
|
+
language = {en},
|
|
31
|
+
urldate = {2024-04-12},
|
|
32
|
+
author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
|
|
33
|
+
month = feb,
|
|
34
|
+
year = {2024},
|
|
35
|
+
}
|
|
36
|
+
""",
|
|
37
|
+
public_training_data="https://huggingface.co/datasets/danish-foundation-models/danish-gigaword", # paragraphs extracted from Danish Gigaword
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
dfm_enc_med = ModelMeta(
|
|
41
|
+
loader=sentence_transformers_loader, # type: ignore
|
|
42
|
+
name="KennethEnevoldsen/dfm-sentence-encoder-medium",
|
|
43
|
+
languages=["dan-Latn"],
|
|
44
|
+
open_weights=True,
|
|
45
|
+
revision="701bce95d499fa97610d57e8823c54fd1fb79930",
|
|
46
|
+
release_date="2023-07-12",
|
|
47
|
+
n_parameters=124445952,
|
|
48
|
+
memory_usage_mb=475,
|
|
49
|
+
embed_dim=768,
|
|
50
|
+
license="mit",
|
|
51
|
+
max_tokens=512,
|
|
52
|
+
reference="https://huggingface.co/KennethEnevoldsen/dfm-sentence-encoder-medium",
|
|
53
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
54
|
+
framework=["Sentence Transformers", "PyTorch"],
|
|
55
|
+
use_instructions=False,
|
|
56
|
+
superseded_by=None,
|
|
57
|
+
adapted_from=None,
|
|
58
|
+
public_training_code=None,
|
|
59
|
+
training_datasets=set(), # just contrastive pre-training
|
|
60
|
+
citation="""@article{enevoldsenScandinavianEmbeddingBenchmarks2024,
|
|
61
|
+
title = {The {Scandinavian} {Embedding} {Benchmarks}: {Comprehensive} {Assessment} of {Multilingual} and {Monolingual} {Text} {Embedding}},
|
|
62
|
+
shorttitle = {The {Scandinavian} {Embedding} {Benchmarks}},
|
|
63
|
+
url = {https://openreview.net/forum?id=pJl_i7HIA72},
|
|
64
|
+
language = {en},
|
|
65
|
+
urldate = {2024-04-12},
|
|
66
|
+
author = {Enevoldsen, Kenneth and Kardos, Márton and Muennighoff, Niklas and Nielbo, Kristoffer},
|
|
67
|
+
month = feb,
|
|
68
|
+
year = {2024},
|
|
69
|
+
}
|
|
70
|
+
""",
|
|
71
|
+
public_training_data=None,
|
|
72
|
+
)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import torch
|
|
2
2
|
|
|
3
|
+
from mteb.models.instruct_wrapper import instruct_wrapper
|
|
3
4
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
4
|
-
from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
|
|
5
5
|
from mteb.types import PromptType
|
|
6
6
|
|
|
7
7
|
from .e5_instruct import E5_MISTRAL_TRAINING_DATA
|
|
@@ -22,7 +22,7 @@ def instruction_template(
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
Linq_Embed_Mistral = ModelMeta(
|
|
25
|
-
loader=
|
|
25
|
+
loader=instruct_wrapper,
|
|
26
26
|
loader_kwargs=dict(
|
|
27
27
|
instruction_template=instruction_template,
|
|
28
28
|
attn="cccc",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.3.
|
|
3
|
+
Version: 2.3.3
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -108,7 +108,7 @@ Requires-Dist: qwen_vl_utils>=0.0.14; extra == "eager-embed"
|
|
|
108
108
|
Dynamic: license-file
|
|
109
109
|
|
|
110
110
|
<h1 align="center">
|
|
111
|
-
<img src="docs/images/logos/mteb_logo/dots-icon.png" alt="MTEB" width="28" style="vertical-align: middle; margin-right: 10px;"/> MTEB
|
|
111
|
+
<img src="https://github.com/embeddings-benchmark/mteb/blob/main/docs/images/logos/mteb_logo/dots-icon.png?raw=true" alt="MTEB" width="28" style="vertical-align: middle; margin-right: 10px;"/> MTEB
|
|
112
112
|
</h1>
|
|
113
113
|
|
|
114
114
|
<h3 align="center" style="border-bottom: none;">Multimodal toolbox for evaluating embeddings and retrieval systems</h3>
|
|
@@ -137,7 +137,7 @@ Dynamic: license-file
|
|
|
137
137
|
|
|
138
138
|
|
|
139
139
|
<h3 align="center">
|
|
140
|
-
<a href="https://huggingface.co/spaces/mteb/leaderboard"><img style="float: middle; padding: 10px 10px 10px 10px;" width="60" height="55" src="
|
|
140
|
+
<a href="https://huggingface.co/spaces/mteb/leaderboard"><img style="float: middle; padding: 10px 10px 10px 10px;" width="60" height="55" src="https://github.com/embeddings-benchmark/mteb/blob/main/docs/images/logos/hf_logo.png?raw=true" /></a>
|
|
141
141
|
</h3>
|
|
142
142
|
|
|
143
143
|
|
|
@@ -52,7 +52,7 @@ mteb/abstasks/text/bitext_mining.py,sha256=8m86XHJ3TxguC9itxZRq2Bt_p0NYojojS2Btk
|
|
|
52
52
|
mteb/abstasks/text/reranking.py,sha256=rfRGRBeSjZLgkh8pneMgRm-vd9NHr5jSFH92YfOHfmU,7776
|
|
53
53
|
mteb/abstasks/text/summarization.py,sha256=KYEb8gh4JjpSsrvGUmQ2VlrVdzzVxIWcitXOJUaHhO4,6954
|
|
54
54
|
mteb/benchmarks/__init__.py,sha256=MQEVeli-zLaJ7Xg0z7RhXQwsdmm7Ht_W2Ln0rZo1Szc,225
|
|
55
|
-
mteb/benchmarks/_create_table.py,sha256=
|
|
55
|
+
mteb/benchmarks/_create_table.py,sha256=OAiR44ynJ2fMzoBmVITQtOTYQzxIu9KUdS_HzlBlAck,20195
|
|
56
56
|
mteb/benchmarks/benchmark.py,sha256=70RlMyyg_wkWTlU_IbfLl-KaqRWXGCKTd8fWe9X-AQE,4173
|
|
57
57
|
mteb/benchmarks/get_benchmark.py,sha256=-n_O-gitRKZi48gJKNgGuI36hsP7yLVSiwulnMHN7Gw,3935
|
|
58
58
|
mteb/benchmarks/benchmarks/__init__.py,sha256=0ySgD14Mu3Y1nJzazR_eUir81ia3x6E23N57SzQNkF0,2150
|
|
@@ -1424,10 +1424,10 @@ mteb/languages/language_family.json,sha256=OUGcHeOIPcZPb2FWmYLhxTS0JxjK5y3Fo6x0P
|
|
|
1424
1424
|
mteb/languages/language_scripts.py,sha256=5wix9HTYolNIpTiS5oXf2pGJyL7ftdGKs_m432w81V8,3998
|
|
1425
1425
|
mteb/languages/programming_languages.py,sha256=zxAakT3OSUnAuTnQ34VyeFIECnNXMlleZmAake6jsZE,211
|
|
1426
1426
|
mteb/leaderboard/__init__.py,sha256=991roXmtRwEQysV-37hWEzWpkvPgMCGRqZTHR-hm2io,88
|
|
1427
|
-
mteb/leaderboard/app.py,sha256=
|
|
1427
|
+
mteb/leaderboard/app.py,sha256=29MxFLKEVT-roULHG5boHmsQVhld1rDGNS94r7MWlz8,33118
|
|
1428
1428
|
mteb/leaderboard/benchmark_selector.py,sha256=uH66SI0iT1J4_fnebViWa83dQwhPi7toBv7PRL_epDw,7784
|
|
1429
|
-
mteb/leaderboard/figures.py,sha256=
|
|
1430
|
-
mteb/leaderboard/table.py,sha256=
|
|
1429
|
+
mteb/leaderboard/figures.py,sha256=cfOK82rRf-7sCjyP7GBxh4ezhOIt0OhD0_86mKtzLrg,7530
|
|
1430
|
+
mteb/leaderboard/table.py,sha256=6SnrYC5GcBlvVSO6vOk6ObuqtoveBLv3JUuXqdKueG8,8333
|
|
1431
1431
|
mteb/leaderboard/text_segments.py,sha256=iMIkS04QQjPbT-SkU0x6fOcS8xRbUYevryu9HydipKM,6570
|
|
1432
1432
|
mteb/models/__init__.py,sha256=ABTuoqiBjBtBWW3LYY7ItBHdylR6jWoy06HH0g6j6fU,910
|
|
1433
1433
|
mteb/models/abs_encoder.py,sha256=m0JkRfRPMYadDgBR9eozRloI31ZSWkSzDFINpwbfLZk,16533
|
|
@@ -1460,9 +1460,10 @@ mteb/models/model_implementations/bmretriever_models.py,sha256=ABfrACa028Dcujan7
|
|
|
1460
1460
|
mteb/models/model_implementations/cadet_models.py,sha256=bDula_VroXOWgSw-tquvNVGcGg7_Z1xHnoTDn6OGOYU,2225
|
|
1461
1461
|
mteb/models/model_implementations/cde_models.py,sha256=3nNU3nq3VZZcImFqH1VPj57-QJNMU6Ei2C_HCaicuUs,9012
|
|
1462
1462
|
mteb/models/model_implementations/clip_models.py,sha256=zrfgNmZszu0JMtMNdCMzEohixsrnQ7xFhCqgsiucH_Q,6107
|
|
1463
|
+
mteb/models/model_implementations/clips_models.py,sha256=QwwoU4Zu_zwUgUg7Hn2lzpXK-GjXIST0qF_2oRxHm2Y,3410
|
|
1463
1464
|
mteb/models/model_implementations/codefuse_models.py,sha256=19Y-d_qetVU64quzEvuUJ_K8DHo1JEEKEGqjRR48dFg,9113
|
|
1464
1465
|
mteb/models/model_implementations/codesage_models.py,sha256=D4CdISGyv5f2GMYq4_efgm5qNq80SWAX5R2u5mjEiXM,2998
|
|
1465
|
-
mteb/models/model_implementations/cohere_models.py,sha256=
|
|
1466
|
+
mteb/models/model_implementations/cohere_models.py,sha256=OWFClVAN4phjBoxfGGDyGDmzMu-t2VrjCGFyAIWmz4w,13832
|
|
1466
1467
|
mteb/models/model_implementations/cohere_v.py,sha256=K6VEw1NkyM2PuMd18kHE6aqPrcByYSwEmAKjvLods_w,15760
|
|
1467
1468
|
mteb/models/model_implementations/colpali_models.py,sha256=7PJ0SshVXasyncTfZRFIf_ZWzbqxJhhzNKAoGLhNktw,9004
|
|
1468
1469
|
mteb/models/model_implementations/colqwen_models.py,sha256=6upaxe19V8j5Ayu03Dgj5jPtC8SJBCITK_RionJRMSE,15545
|
|
@@ -1480,7 +1481,7 @@ mteb/models/model_implementations/evaclip_models.py,sha256=cPMGYLDIq4s8zJxb4vPXq
|
|
|
1480
1481
|
mteb/models/model_implementations/fa_models.py,sha256=WGal70_ezITWoNdjcMdbOCTSCtoaXzuPadYstLVXxhg,7478
|
|
1481
1482
|
mteb/models/model_implementations/geogpt_models.py,sha256=Juv86SwhgQX80lVLjAFtim2aSiJT1AcgjniyyiKyk1Q,1923
|
|
1482
1483
|
mteb/models/model_implementations/gme_v_models.py,sha256=NkfgR3_UdZzoBt1NnalVou6LOR-F7qXM4by9EbAVrys,13568
|
|
1483
|
-
mteb/models/model_implementations/google_models.py,sha256=
|
|
1484
|
+
mteb/models/model_implementations/google_models.py,sha256=7QfsaJ5JNDRQxFl7Zh2AtiR2PR7PZcfeCBgviuOFBCo,9130
|
|
1484
1485
|
mteb/models/model_implementations/granite_vision_embedding_models.py,sha256=uqQ5-e_a-ADv3gf3sR9Drk0S4x8Gy8mZkpL-E4X16TM,7241
|
|
1485
1486
|
mteb/models/model_implementations/gritlm_models.py,sha256=aS_CuioL95JAQMYiaKlGuAWU9wZjabn268Xut3bD8-w,3005
|
|
1486
1487
|
mteb/models/model_implementations/gte_models.py,sha256=o26Xyu_tucUlP435Q_jB4-bl0xckgj4wtbutTwhYgIo,10073
|
|
@@ -1492,9 +1493,10 @@ mteb/models/model_implementations/jasper_models.py,sha256=ZY7qRRpBpD3eVryQb4rLs5
|
|
|
1492
1493
|
mteb/models/model_implementations/jina_clip.py,sha256=CfiIxbhKspjQajNtObCfGPHOWPk6uLn4cuwydQHFTMo,5118
|
|
1493
1494
|
mteb/models/model_implementations/jina_models.py,sha256=HrHm2Io3g9gHwxU5icAaudy_E8rAVkAAIFSzVYWF-dM,34859
|
|
1494
1495
|
mteb/models/model_implementations/kalm_models.py,sha256=FmW7Z5Qs6WYBLuKvql3u4IJW36kj4k-Ypah8qTBEBkg,59837
|
|
1496
|
+
mteb/models/model_implementations/kennethenevoldsen_models.py,sha256=DF-9nmsewYO9ikZ0kV81ujKGr7Ot36-9iPoxN7KX2mY,2993
|
|
1495
1497
|
mteb/models/model_implementations/lens_models.py,sha256=fC7_NB1F8vBAlXD0p0-hALf6eZTPFJwpz57dy71OlwI,1696
|
|
1496
1498
|
mteb/models/model_implementations/lgai_embedding_models.py,sha256=S83pbfkMH3YUNl4skusgbK-Rn-uLuScQVxgXwegR_N4,2333
|
|
1497
|
-
mteb/models/model_implementations/linq_models.py,sha256=
|
|
1499
|
+
mteb/models/model_implementations/linq_models.py,sha256=EtvUyiNbjU-GJd1kS0Z0gBACkP2pFOjk0KfGMZz4K9Y,1872
|
|
1498
1500
|
mteb/models/model_implementations/listconranker.py,sha256=pFISrZ91NHsnhc5El5U_ZPsB9cSTuTY8-nDzpoNMC9s,4485
|
|
1499
1501
|
mteb/models/model_implementations/llm2clip_models.py,sha256=_sqAOb5oSbxn1oaXjWwPXRjTvxLT48xXL_tuabt2Ks0,9265
|
|
1500
1502
|
mteb/models/model_implementations/llm2vec_models.py,sha256=Og_EqnOXgIfaTcVTl3Lj5BicG83ycnXS_YHNtK63I-A,12638
|
|
@@ -2567,9 +2569,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2567
2569
|
mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
|
|
2568
2570
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2569
2571
|
mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
|
|
2570
|
-
mteb-2.3.
|
|
2571
|
-
mteb-2.3.
|
|
2572
|
-
mteb-2.3.
|
|
2573
|
-
mteb-2.3.
|
|
2574
|
-
mteb-2.3.
|
|
2575
|
-
mteb-2.3.
|
|
2572
|
+
mteb-2.3.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2573
|
+
mteb-2.3.3.dist-info/METADATA,sha256=LbvRqywjhaqAK4910G8ueME52YrrqFzvm4NXl2M3MBA,13923
|
|
2574
|
+
mteb-2.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2575
|
+
mteb-2.3.3.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2576
|
+
mteb-2.3.3.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2577
|
+
mteb-2.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|