mteb 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +7 -3
- mteb/_evaluators/any_sts_evaluator.py +6 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +2 -1
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -5
- mteb/_evaluators/pair_classification_evaluator.py +2 -2
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/sklearn_evaluator.py +3 -3
- mteb/_evaluators/text/bitext_mining_evaluator.py +5 -3
- mteb/_evaluators/text/summarization_evaluator.py +3 -2
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/abstask.py +3 -2
- mteb/abstasks/aggregated_task.py +3 -3
- mteb/abstasks/classification.py +3 -3
- mteb/abstasks/clustering.py +2 -2
- mteb/abstasks/clustering_legacy.py +2 -2
- mteb/abstasks/image/image_text_pair_classification.py +2 -1
- mteb/abstasks/multilabel_classification.py +2 -2
- mteb/abstasks/pair_classification.py +2 -2
- mteb/abstasks/retrieval.py +15 -14
- mteb/abstasks/sts.py +2 -2
- mteb/abstasks/text/bitext_mining.py +3 -3
- mteb/abstasks/text/summarization.py +2 -2
- mteb/abstasks/zeroshot_classification.py +3 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +24 -0
- mteb/cli/build_cli.py +2 -1
- mteb/deprecated_evaluator.py +3 -3
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/evaluate.py +5 -3
- mteb/models/abs_encoder.py +3 -1
- mteb/models/instruct_wrapper.py +1 -1
- mteb/models/model_implementations/bm25.py +3 -3
- mteb/models/model_implementations/jina_clip.py +46 -8
- mteb/models/model_implementations/mxbai_models.py +118 -1
- mteb/models/model_implementations/nvidia_models.py +73 -5
- mteb/models/model_implementations/octen_models.py +30 -0
- mteb/models/model_implementations/pylate_models.py +5 -4
- mteb/models/model_implementations/sentence_transformers_models.py +66 -0
- mteb/models/models_protocols.py +6 -4
- mteb/models/search_wrappers.py +7 -6
- mteb/models/sentence_transformer_wrapper.py +5 -4
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/METADATA +1 -1
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/RECORD +55 -50
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/WHEEL +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/entry_points.txt +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/top_level.txt +0 -0

mteb/benchmarks/benchmarks/benchmarks.py
CHANGED

@@ -2728,3 +2728,27 @@ JMTEB_LITE_V1 = Benchmark(
     """,
     contacts=["lsz05"],
 )
+
+KOVIDORE_V2 = Benchmark(
+    name="KoViDoRe(v2)",
+    display_name="KoViDoRe v2",
+    tasks=get_tasks(
+        tasks=[
+            "KoVidore2CybersecurityRetrieval",
+            "KoVidore2EconomicRetrieval",
+            "KoVidore2EnergyRetrieval",
+            "KoVidore2HrRetrieval",
+        ]
+    ),
+    description="KoViDoRe v2 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents.",
+    reference="https://github.com/whybe-choi/kovidore-data-generator",
+    citation=r"""
+@misc{choi2026kovidorev2,
+    author = {Yongbin Choi},
+    note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
+    title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
+    url = {https://github.com/whybe-choi/kovidore-data-generator},
+    year = {2026},
+}
+""",
+)
mteb/cli/build_cli.py
CHANGED

@@ -13,6 +13,7 @@ from mteb.cache import ResultCache
 from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
 from mteb.cli.generate_model_card import generate_model_card
 from mteb.evaluate import OverwriteStrategy
+from mteb.types._encoder_io import EncodeKwargs

 logger = logging.getLogger(__name__)

@@ -64,7 +65,7 @@ def run(args: argparse.Namespace) -> None:
         eval_splits=args.eval_splits,
     )

-    encode_kwargs = {}
+    encode_kwargs: EncodeKwargs = {}
     if args.batch_size is not None:
         encode_kwargs["batch_size"] = args.batch_size
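
`EncodeKwargs` lives in the new `mteb/types/_encoder_io.py` (+12 lines in this diff). Its definition is not shown here; a minimal sketch of the annotation pattern the CLI now uses, assuming `EncodeKwargs` behaves like a `total=False` TypedDict in which `batch_size` is a valid key (the only key this diff confirms):

```python
from mteb.types import EncodeKwargs

# Same pattern as build_cli.py: start empty, add keys conditionally.
encode_kwargs: EncodeKwargs = {}
encode_kwargs["batch_size"] = 16  # a type checker can now verify this key
```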
mteb/deprecated_evaluator.py
CHANGED

@@ -28,7 +28,7 @@ from mteb.models import (
     SentenceTransformerEncoderWrapper,
 )
 from mteb.results import TaskResult
-from mteb.types import ScoresDict
+from mteb.types import EncodeKwargs, ScoresDict

 if sys.version_info >= (3, 13):
     from warnings import deprecated

@@ -174,7 +174,7 @@ class MTEB:
         split: str,
         subsets_to_run: list[str] | None = None,
         *,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         **kwargs: Any,
     ):
         tick = time()

@@ -263,7 +263,7 @@ class MTEB:
         overwrite_results: bool = False,
         raise_error: bool = True,
         co2_tracker: bool = False,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs | None = None,
         **kwargs,
     ) -> list[TaskResult]:
         """Run the evaluation pipeline on the selected tasks.

mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json
ADDED

@@ -0,0 +1,32 @@
+{
+  "test": {
+    "num_samples": 1299,
+    "number_of_characters": 9254,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 2245,
+      "average_image_width": 2370.324347826087,
+      "max_image_width": 3508,
+      "min_image_height": 2481,
+      "average_image_height": 3289.8060869565215,
+      "max_image_height": 3580,
+      "unique_images": 1132
+    },
+    "queries_text_statistics": {
+      "total_text_length": 9254,
+      "min_text_length": 15,
+      "average_text_length": 62.10738255033557,
+      "max_text_length": 108,
+      "unique_texts": 149
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 409,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 2.7449664429530203,
+      "max_relevant_docs_per_query": 7,
+      "unique_relevant_docs": 316
+    },
+    "top_ranked_statistics": null
+  }
+}

mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json
ADDED

@@ -0,0 +1,32 @@
+{
+  "test": {
+    "num_samples": 1640,
+    "number_of_characters": 8331,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 2313,
+      "average_image_width": 2347.5321597833445,
+      "max_image_width": 2481,
+      "min_image_height": 3138,
+      "average_image_height": 3214.301963439404,
+      "max_image_height": 3508,
+      "unique_images": 1442
+    },
+    "queries_text_statistics": {
+      "total_text_length": 8331,
+      "min_text_length": 23,
+      "average_text_length": 51.11042944785276,
+      "max_text_length": 110,
+      "unique_texts": 163
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 413,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 2.5337423312883436,
+      "max_relevant_docs_per_query": 6,
+      "unique_relevant_docs": 349
+    },
+    "top_ranked_statistics": null
+  }
+}

mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json
ADDED

@@ -0,0 +1,32 @@
+{
+  "test": {
+    "num_samples": 2166,
+    "number_of_characters": 9764,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 2221,
+      "average_image_width": 2339.4957350727545,
+      "max_image_width": 2480,
+      "min_image_height": 3036,
+      "average_image_height": 3242.8138484696437,
+      "max_image_height": 3508,
+      "unique_images": 1974
+    },
+    "queries_text_statistics": {
+      "total_text_length": 9764,
+      "min_text_length": 22,
+      "average_text_length": 56.4393063583815,
+      "max_text_length": 103,
+      "unique_texts": 173
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 525,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 3.0346820809248554,
+      "max_relevant_docs_per_query": 7,
+      "unique_relevant_docs": 442
+    },
+    "top_ranked_statistics": null
+  }
+}

mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json
ADDED

@@ -0,0 +1,32 @@
+{
+  "test": {
+    "num_samples": 2330,
+    "number_of_characters": 13131,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 1949,
+      "average_image_width": 2430.1152204836417,
+      "max_image_width": 3505,
+      "min_image_height": 2480,
+      "average_image_height": 3350.3921289710765,
+      "max_image_height": 3626,
+      "unique_images": 2096
+    },
+    "queries_text_statistics": {
+      "total_text_length": 13131,
+      "min_text_length": 21,
+      "average_text_length": 59.41628959276018,
+      "max_text_length": 112,
+      "unique_texts": 221
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 726,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 3.2850678733031673,
+      "max_relevant_docs_per_query": 7,
+      "unique_relevant_docs": 575
+    },
+    "top_ranked_statistics": null
+  }
+}
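
The averages in these four files are plain ratios: in the cybersecurity split, for example, 409 relevance judgments over 149 queries gives 2.7449…, and 9254 query characters over 149 queries gives 62.107…. A short sketch verifying this from the first file (path from the file list above; it assumes every query text is unique, so `unique_texts` equals the query count, which these numbers support):

```python
import json

path = "mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json"
with open(path) as f:
    stats = json.load(f)["test"]

queries = stats["queries_text_statistics"]
relevant = stats["relevant_docs_statistics"]
n_queries = queries["unique_texts"]  # 149; all query texts in this split are unique

# 409 / 149 == 2.7449664429530203 and 9254 / 149 == 62.10738255033557
assert abs(relevant["num_relevant_docs"] / n_queries - relevant["average_relevant_docs_per_query"]) < 1e-9
assert abs(queries["total_text_length"] / n_queries - queries["average_text_length"]) < 1e-9
```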
mteb/evaluate.py
CHANGED

@@ -5,7 +5,7 @@ import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from time import time
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, cast

 from datasets.exceptions import DatasetNotFoundError
 from tqdm.auto import tqdm

@@ -27,6 +27,7 @@ from mteb.models.sentence_transformer_wrapper import (
 from mteb.results import ModelResult, TaskResult
 from mteb.results.task_result import TaskError
 from mteb.types import HFSubset, PromptType, SplitName
+from mteb.types._encoder_io import EncodeKwargs
 from mteb.types._metadata import ModelName, Revision

 if TYPE_CHECKING:

@@ -85,9 +86,10 @@ def _evaluate_task(
     *,
     splits: dict[SplitName, list[HFSubset]],
     co2_tracker: bool | None,
-    encode_kwargs:
+    encode_kwargs: EncodeKwargs,
     prediction_folder: Path | None,
     public_only: bool | None,
+    num_proc: int = 1,
 ) -> TaskResult | TaskError:
     """The core logic to run a model on a given task. See `evaluate` for more details.

@@ -270,7 +272,7 @@ def evaluate(
     *,
     co2_tracker: bool | None = None,
     raise_error: bool = True,
-    encode_kwargs:
+    encode_kwargs: EncodeKwargs | None = None,
     cache: ResultCache | None = ResultCache(),
     overwrite_strategy: str | OverwriteStrategy = "only-missing",
     prediction_folder: Path | str | None = None,
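
Together with the CLI change above, the typed `encode_kwargs` now flows through the public entry point. A sketch under the signature shown in this hunk (the task and model choices are illustrative):

```python
import mteb
from mteb.types import EncodeKwargs

model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["NFCorpus"])

# encode_kwargs defaults to None; pass a typed dict to control encoding.
encode_kwargs: EncodeKwargs = {"batch_size": 32}
results = mteb.evaluate(model, tasks, encode_kwargs=encode_kwargs)
```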
mteb/models/abs_encoder.py
CHANGED

@@ -5,6 +5,7 @@ from collections.abc import Callable, Sequence
 from typing import Any, Literal, cast, get_args, overload

 from torch.utils.data import DataLoader
+from typing_extensions import Unpack

 import mteb
 from mteb.abstasks.task_metadata import TaskMetadata, TaskType

@@ -19,6 +20,7 @@ from mteb.similarity_functions import (
 from mteb.types import (
     Array,
     BatchedInput,
+    EncodeKwargs,
     PromptType,
 )

@@ -370,7 +372,7 @@ class AbsEncoder(ABC):
         hf_split: str,
         hf_subset: str,
         prompt_type: PromptType | None = None,
-        **kwargs:
+        **kwargs: Unpack[EncodeKwargs],
     ) -> Array:
         """Encodes the given sentences using the encoder.
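
`Unpack` over a TypedDict is the PEP 692 pattern for typing `**kwargs`. A self-contained sketch of the idiom (`_Kwargs` is a hypothetical stand-in, not mteb's actual `EncodeKwargs` definition):

```python
from typing_extensions import TypedDict, Unpack


class _Kwargs(TypedDict, total=False):  # hypothetical stand-in for EncodeKwargs
    batch_size: int


def encode(texts: list[str], **kwargs: Unpack[_Kwargs]) -> None:
    # With Unpack, type checkers reject unknown keys and mistyped values.
    batch_size = kwargs.get("batch_size", 32)
    print(f"encoding {len(texts)} texts with batch_size={batch_size}")


encode(["hello"], batch_size=8)  # OK; batch_size="8" would be flagged
```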
mteb/models/instruct_wrapper.py
CHANGED

@@ -92,7 +92,7 @@ def instruct_wrapper(
         logger.info(
             f"Using instruction: '{instruction}' for task: '{task_metadata.name}'"
         )
-        embeddings = super().encode(  # type: ignore[safe-super]
+        embeddings = super().encode(  # type: ignore[safe-super,call-arg]
             _inputs,  # type: ignore[arg-type]
             instruction=instruction,
             *args,

mteb/models/model_implementations/bm25.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-from typing import Any

 from mteb._create_dataloaders import _create_text_queries_dataloader
 from mteb._requires_package import requires_package

@@ -8,6 +7,7 @@ from mteb.models.model_meta import ModelMeta
 from mteb.models.models_protocols import SearchProtocol
 from mteb.types import (
     CorpusDatasetType,
+    EncodeKwargs,
     InstructionDatasetType,
     QueryDatasetType,
     RetrievalOutputType,

@@ -49,7 +49,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
         task_metadata: TaskMetadata,
         hf_split: str,
         hf_subset: str,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
     ) -> None:
         logger.info("Encoding Corpus...")
         corpus_texts = [

@@ -74,7 +74,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
         hf_split: str,
         hf_subset: str,
         top_k: int,
-        encode_kwargs:
+        encode_kwargs: EncodeKwargs,
         instructions: InstructionDatasetType | None = None,
         top_ranked: TopRankedDocumentsType | None = None,
     ) -> RetrievalOutputType:

mteb/models/model_implementations/jina_clip.py
CHANGED

@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
 from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType

@@ -120,6 +121,15 @@ class JinaCLIPModel(AbsEncoder):
         raise ValueError


+_JINA_CLIP_TRAIN_DATASETS_V1 = {
+    # LAION400M
+    # ShareGPT4V
+    "MSMARCO",
+    "NQ",
+    "HotpotQA",
+    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
+}
+
 jina_clip_v1 = ModelMeta(
     loader=JinaCLIPModel,
     name="jinaai/jina-clip-v1",

@@ -140,13 +150,41 @@ jina_clip_v1 = ModelMeta(
     reference="https://huggingface.co/jinaai/jina-clip-v1",
     similarity_fn_name=ScoringFunction.COSINE,
     use_instructions=True,
-    training_datasets=
-    # LAION400M
-    # ShareGPT4V
-    "MSMARCO",
-    # NQ
-    # HotpotQA
-    # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
-    },
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
     citation=JINA_CLIP_CITATION,
+    superseded_by="jinaai/jina-clip-v2",
+)
+
+jina_clip_v2 = ModelMeta(
+    loader=JinaCLIPModel,
+    name="jinaai/jina-clip-v2",
+    revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
+    release_date="2024-10-09",
+    languages=["eng-Latn"],
+    n_parameters=865278477,
+    memory_usage_mb=1650.0,
+    max_tokens=8192,
+    embed_dim=1024,
+    license="cc-by-nc-4.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/jinaai/jina-clip-v2",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=False,
+    training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
+    modalities=["text", "image"],
+    model_type=["dense"],
+    citation="""
+@misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
+    title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
+    author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
+    year={2024},
+    eprint={2412.08802},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2412.08802},
+}
+""",
 )
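
With this registration, the v2 checkpoint loads by name and its metadata is queryable. A minimal sketch (note the `cc-by-nc-4.0` license, and that v1 is now marked `superseded_by` v2):

```python
import mteb

meta = mteb.get_model_meta("jinaai/jina-clip-v2")
print(meta.embed_dim, meta.max_tokens, meta.modalities)  # 1024 8192 ['text', 'image']

# Loads through the same JinaCLIPModel loader as v1.
model = mteb.get_model("jinaai/jina-clip-v2")
```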

mteb/models/model_implementations/mxbai_models.py
CHANGED

@@ -2,7 +2,10 @@ from mteb.models.model_meta import (
     ModelMeta,
     ScoringFunction,
 )
-from mteb.models.sentence_transformer_wrapper import
+from mteb.models.sentence_transformer_wrapper import (
+    CrossEncoderWrapper,
+    sentence_transformers_loader,
+)

 mixedbread_training_data = {
     # from correspondence:

@@ -122,3 +125,117 @@ mxbai_embed_xsmall_v1 = ModelMeta(
     url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
 }""",
 )
+
+mxbai_rerank_xsmall_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-xsmall-v1",
+    revision="b5c6e9da73abc3711f593f705371cdbe9e0fe422",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=70830337,
+    memory_usage_mb=135.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-xsmall-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+    title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+    author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+    year={2024},
+    url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
+
+mxbai_rerank_base_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-base-v1",
+    revision="800f24c113213a187e65bde9db00c15a2bb12738",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=184422913,
+    memory_usage_mb=352.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+    title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+    author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+    year={2024},
+    url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
+
+mxbai_rerank_large_v1 = ModelMeta(
+    loader=CrossEncoderWrapper,
+    name="mixedbread-ai/mxbai-rerank-large-v1",
+    revision="98f655841d5caf0b16eaff79c2b4ca109d920d17",
+    release_date="2024-02-29",
+    languages=["eng-Latn"],
+    n_parameters=435062785,
+    memory_usage_mb=830.0,
+    max_tokens=512,
+    embed_dim=None,
+    license="apache-2.0",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[
+        "PyTorch",
+        "Sentence Transformers",
+        "Transformers",
+        "ONNX",
+        "safetensors",
+    ],
+    reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1",
+    similarity_fn_name=None,
+    use_instructions=None,
+    training_datasets=None,
+    adapted_from=None,
+    superseded_by=None,
+    modalities=["text"],
+    model_type=["cross-encoder"],
+    citation="""@online{rerank2024mxbai,
+    title={Boost Your Search With The Crispy Mixedbread Rerank Models},
+    author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
+    year={2024},
+    url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
+}""",
+    contacts=None,
+)
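
The three new entries register as cross-encoders (`model_type=["cross-encoder"]`, loaded through `CrossEncoderWrapper`), so they score query-document pairs rather than produce embeddings. A minimal loading sketch:

```python
import mteb

# All three sizes share the same wrapper; select one by name.
reranker = mteb.get_model("mixedbread-ai/mxbai-rerank-base-v1")

meta = mteb.get_model_meta("mixedbread-ai/mxbai-rerank-base-v1")
print(meta.max_tokens, meta.model_type)  # 512 ['cross-encoder']
```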

mteb/models/model_implementations/nvidia_models.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from collections.abc import Callable
 from typing import Any

 import torch

@@ -29,7 +30,7 @@ NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
 }"""


-def
+def _instruction_template(
     instruction: str, prompt_type: PromptType | None = None
 ) -> str:
     return f"Instruct: {instruction}\nQuery: " if instruction else ""

@@ -100,10 +101,77 @@ nvidia_training_datasets = {
     "MrTidyRetrieval",
 }

+
+class _NVEmbedWrapper(InstructSentenceTransformerModel):
+    """Inherited, because nvembed requires `sbert==2`, but it doesn't have tokenizers kwargs"""
+
+    def __init__(
+        self,
+        model_name: str,
+        revision: str,
+        instruction_template: str
+        | Callable[[str, PromptType | None], str]
+        | None = None,
+        max_seq_length: int | None = None,
+        apply_instruction_to_passages: bool = True,
+        padding_side: str | None = None,
+        add_eos_token: bool = False,
+        prompts_dict: dict[str, str] | None = None,
+        **kwargs: Any,
+    ):
+        from sentence_transformers import __version__ as sbert_version
+
+        required_transformers_version = "4.42.4"
+        required_sbert_version = "2.7.0"
+
+        if Version(transformers_version) != Version(required_transformers_version):
+            raise RuntimeError(
+                f"transformers version {transformers_version} is not match with required "
+                f"install version {required_transformers_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        if Version(sbert_version) != Version(required_sbert_version):
+            raise RuntimeError(
+                f"sbert version {sbert_version} is not match with required "
+                f"install version {required_sbert_version} to run `nvidia/NV-Embed-v2`"
+            )
+
+        requires_package(
+            self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
+        )
+
+        from sentence_transformers import SentenceTransformer
+
+        if (
+            isinstance(instruction_template, str)
+            and "{instruction}" not in instruction_template
+        ):
+            raise ValueError(
+                "Instruction template must contain the string '{instruction}'."
+            )
+        if instruction_template is None:
+            logger.warning(
+                "No instruction template provided. Instructions will be used as-is."
+            )
+
+        self.instruction_template = instruction_template
+
+        self.model_name = model_name
+        self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
+        self.model.tokenizer.padding_side = padding_side
+        self.model.tokenizer.add_eos_token = add_eos_token
+
+        if max_seq_length:
+            # https://github.com/huggingface/sentence-transformers/issues/3575
+            self.model.max_seq_length = max_seq_length
+        self.apply_instruction_to_passages = apply_instruction_to_passages
+        self.prompts_dict = prompts_dict
+
+
 NV_embed_v2 = ModelMeta(
-    loader=
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",

@@ -132,9 +200,9 @@ NV_embed_v2 = ModelMeta(
 )

 NV_embed_v1 = ModelMeta(
-    loader=
+    loader=_NVEmbedWrapper,
     loader_kwargs=dict(
-        instruction_template=
+        instruction_template=_instruction_template,
         trust_remote_code=True,
         max_seq_length=32768,
         padding_side="right",
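
Because `_NVEmbedWrapper` rejects anything but the exact versions it checks for, loading NV-Embed through mteb now fails fast on mismatched environments. A sketch of what those checks imply for users:

```python
# Per the checks above, NV-Embed requires exact pins:
#   transformers==4.42.4 and sentence-transformers==2.7.0,
# plus flash_attn (e.g. via `pip install 'mteb[flash_attention]'`).
import mteb

# Instantiating the loader raises RuntimeError unless those pins are installed.
model = mteb.get_model("nvidia/NV-Embed-v2")
```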

mteb/models/model_implementations/octen_models.py
CHANGED

@@ -163,6 +163,36 @@ _PREDEFINED_PROMPTS = {
     "German1Retrieval": "Given a query, retrieve relevant passages",
 }

+Octen_Embedding_0B6 = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=instruction_template,
+        apply_instruction_to_passages=True,
+        prompts_dict=_PREDEFINED_PROMPTS,
+        max_seq_length=18480,
+        model_kwargs={"torch_dtype": "bfloat16"},
+    ),
+    name="bflhc/Octen-Embedding-0.6B",
+    languages=multilingual_langs,
+    open_weights=True,
+    revision="1a00a4e837bd788f6f8d91bc43201a5e52cf8ef8",
+    release_date="2026-01-10",
+    n_parameters=595776512,
+    memory_usage_mb=1136,
+    embed_dim=1024,
+    max_tokens=32768,
+    license="apache-2.0",
+    reference="https://huggingface.co/bflhc/Octen-Embedding-0.6B",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=training_data,
+    citation=OCTEN_CITATION,
+    adapted_from="Qwen/Qwen3-Embedding-0.6B",
+)
+
 Octen_Embedding_4B = ModelMeta(
     loader=InstructSentenceTransformerModel,
     loader_kwargs=dict(