mteb 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. mteb/_create_dataloaders.py +7 -3
  2. mteb/_evaluators/any_sts_evaluator.py +6 -3
  3. mteb/_evaluators/clustering_evaluator.py +2 -2
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -5
  6. mteb/_evaluators/pair_classification_evaluator.py +2 -2
  7. mteb/_evaluators/retrieval_evaluator.py +2 -2
  8. mteb/_evaluators/sklearn_evaluator.py +3 -3
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +5 -3
  10. mteb/_evaluators/text/summarization_evaluator.py +3 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  12. mteb/abstasks/abstask.py +3 -2
  13. mteb/abstasks/aggregated_task.py +3 -3
  14. mteb/abstasks/classification.py +3 -3
  15. mteb/abstasks/clustering.py +2 -2
  16. mteb/abstasks/clustering_legacy.py +2 -2
  17. mteb/abstasks/image/image_text_pair_classification.py +2 -1
  18. mteb/abstasks/multilabel_classification.py +2 -2
  19. mteb/abstasks/pair_classification.py +2 -2
  20. mteb/abstasks/retrieval.py +15 -14
  21. mteb/abstasks/sts.py +2 -2
  22. mteb/abstasks/text/bitext_mining.py +3 -3
  23. mteb/abstasks/text/summarization.py +2 -2
  24. mteb/abstasks/zeroshot_classification.py +3 -2
  25. mteb/benchmarks/benchmarks/__init__.py +2 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +24 -0
  27. mteb/cli/build_cli.py +2 -1
  28. mteb/deprecated_evaluator.py +3 -3
  29. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  30. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  31. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  32. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  33. mteb/evaluate.py +5 -3
  34. mteb/models/abs_encoder.py +3 -1
  35. mteb/models/instruct_wrapper.py +1 -1
  36. mteb/models/model_implementations/bm25.py +3 -3
  37. mteb/models/model_implementations/jina_clip.py +46 -8
  38. mteb/models/model_implementations/mxbai_models.py +118 -1
  39. mteb/models/model_implementations/nvidia_models.py +73 -5
  40. mteb/models/model_implementations/octen_models.py +30 -0
  41. mteb/models/model_implementations/pylate_models.py +5 -4
  42. mteb/models/model_implementations/sentence_transformers_models.py +66 -0
  43. mteb/models/models_protocols.py +6 -4
  44. mteb/models/search_wrappers.py +7 -6
  45. mteb/models/sentence_transformer_wrapper.py +5 -4
  46. mteb/tasks/retrieval/kor/__init__.py +15 -1
  47. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  48. mteb/types/__init__.py +2 -0
  49. mteb/types/_encoder_io.py +12 -0
  50. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/METADATA +1 -1
  51. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/RECORD +55 -50
  52. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/WHEEL +0 -0
  53. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/entry_points.txt +0 -0
  54. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/licenses/LICENSE +0 -0
  55. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/top_level.txt +0 -0
@@ -2728,3 +2728,27 @@ JMTEB_LITE_V1 = Benchmark(
2728
2728
  """,
2729
2729
  contacts=["lsz05"],
2730
2730
  )
2731
+
2732
+ KOVIDORE_V2 = Benchmark(
2733
+ name="KoViDoRe(v2)",
2734
+ display_name="KoViDoRe v2",
2735
+ tasks=get_tasks(
2736
+ tasks=[
2737
+ "KoVidore2CybersecurityRetrieval",
2738
+ "KoVidore2EconomicRetrieval",
2739
+ "KoVidore2EnergyRetrieval",
2740
+ "KoVidore2HrRetrieval",
2741
+ ]
2742
+ ),
2743
+ description="KoViDoRe v2 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents.",
2744
+ reference="https://github.com/whybe-choi/kovidore-data-generator",
2745
+ citation=r"""
2746
+ @misc{choi2026kovidorev2,
2747
+ author = {Yongbin Choi},
2748
+ note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
2749
+ title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
2750
+ url = {https://github.com/whybe-choi/kovidore-data-generator},
2751
+ year = {2026},
2752
+ }
2753
+ """,
2754
+ )
mteb/cli/build_cli.py CHANGED
@@ -13,6 +13,7 @@ from mteb.cache import ResultCache
13
13
  from mteb.cli._display_tasks import _display_benchmarks, _display_tasks
14
14
  from mteb.cli.generate_model_card import generate_model_card
15
15
  from mteb.evaluate import OverwriteStrategy
16
+ from mteb.types._encoder_io import EncodeKwargs
16
17
 
17
18
  logger = logging.getLogger(__name__)
18
19
 
@@ -64,7 +65,7 @@ def run(args: argparse.Namespace) -> None:
64
65
  eval_splits=args.eval_splits,
65
66
  )
66
67
 
67
- encode_kwargs = {}
68
+ encode_kwargs: EncodeKwargs = {}
68
69
  if args.batch_size is not None:
69
70
  encode_kwargs["batch_size"] = args.batch_size
70
71
 
@@ -28,7 +28,7 @@ from mteb.models import (
28
28
  SentenceTransformerEncoderWrapper,
29
29
  )
30
30
  from mteb.results import TaskResult
31
- from mteb.types import ScoresDict
31
+ from mteb.types import EncodeKwargs, ScoresDict
32
32
 
33
33
  if sys.version_info >= (3, 13):
34
34
  from warnings import deprecated
@@ -174,7 +174,7 @@ class MTEB:
174
174
  split: str,
175
175
  subsets_to_run: list[str] | None = None,
176
176
  *,
177
- encode_kwargs: dict[str, Any],
177
+ encode_kwargs: EncodeKwargs,
178
178
  **kwargs: Any,
179
179
  ):
180
180
  tick = time()
@@ -263,7 +263,7 @@ class MTEB:
263
263
  overwrite_results: bool = False,
264
264
  raise_error: bool = True,
265
265
  co2_tracker: bool = False,
266
- encode_kwargs: dict[str, Any] | None = None,
266
+ encode_kwargs: EncodeKwargs | None = None,
267
267
  **kwargs,
268
268
  ) -> list[TaskResult]:
269
269
  """Run the evaluation pipeline on the selected tasks.
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1299,
4
+ "number_of_characters": 9254,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2245,
8
+ "average_image_width": 2370.324347826087,
9
+ "max_image_width": 3508,
10
+ "min_image_height": 2481,
11
+ "average_image_height": 3289.8060869565215,
12
+ "max_image_height": 3580,
13
+ "unique_images": 1132
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 9254,
17
+ "min_text_length": 15,
18
+ "average_text_length": 62.10738255033557,
19
+ "max_text_length": 108,
20
+ "unique_texts": 149
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 409,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 2.7449664429530203,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 316
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1640,
4
+ "number_of_characters": 8331,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2313,
8
+ "average_image_width": 2347.5321597833445,
9
+ "max_image_width": 2481,
10
+ "min_image_height": 3138,
11
+ "average_image_height": 3214.301963439404,
12
+ "max_image_height": 3508,
13
+ "unique_images": 1442
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 8331,
17
+ "min_text_length": 23,
18
+ "average_text_length": 51.11042944785276,
19
+ "max_text_length": 110,
20
+ "unique_texts": 163
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 413,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 2.5337423312883436,
27
+ "max_relevant_docs_per_query": 6,
28
+ "unique_relevant_docs": 349
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2166,
4
+ "number_of_characters": 9764,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 2221,
8
+ "average_image_width": 2339.4957350727545,
9
+ "max_image_width": 2480,
10
+ "min_image_height": 3036,
11
+ "average_image_height": 3242.8138484696437,
12
+ "max_image_height": 3508,
13
+ "unique_images": 1974
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 9764,
17
+ "min_text_length": 22,
18
+ "average_text_length": 56.4393063583815,
19
+ "max_text_length": 103,
20
+ "unique_texts": 173
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 525,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 3.0346820809248554,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 442
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2330,
4
+ "number_of_characters": 13131,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 1949,
8
+ "average_image_width": 2430.1152204836417,
9
+ "max_image_width": 3505,
10
+ "min_image_height": 2480,
11
+ "average_image_height": 3350.3921289710765,
12
+ "max_image_height": 3626,
13
+ "unique_images": 2096
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 13131,
17
+ "min_text_length": 21,
18
+ "average_text_length": 59.41628959276018,
19
+ "max_text_length": 112,
20
+ "unique_texts": 221
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 726,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 3.2850678733031673,
27
+ "max_relevant_docs_per_query": 7,
28
+ "unique_relevant_docs": 575
29
+ },
30
+ "top_ranked_statistics": null
31
+ }
32
+ }
mteb/evaluate.py CHANGED
@@ -5,7 +5,7 @@ import warnings
5
5
  from collections.abc import Iterable
6
6
  from pathlib import Path
7
7
  from time import time
8
- from typing import TYPE_CHECKING, Any, cast
8
+ from typing import TYPE_CHECKING, cast
9
9
 
10
10
  from datasets.exceptions import DatasetNotFoundError
11
11
  from tqdm.auto import tqdm
@@ -27,6 +27,7 @@ from mteb.models.sentence_transformer_wrapper import (
27
27
  from mteb.results import ModelResult, TaskResult
28
28
  from mteb.results.task_result import TaskError
29
29
  from mteb.types import HFSubset, PromptType, SplitName
30
+ from mteb.types._encoder_io import EncodeKwargs
30
31
  from mteb.types._metadata import ModelName, Revision
31
32
 
32
33
  if TYPE_CHECKING:
@@ -85,9 +86,10 @@ def _evaluate_task(
85
86
  *,
86
87
  splits: dict[SplitName, list[HFSubset]],
87
88
  co2_tracker: bool | None,
88
- encode_kwargs: dict[str, Any],
89
+ encode_kwargs: EncodeKwargs,
89
90
  prediction_folder: Path | None,
90
91
  public_only: bool | None,
92
+ num_proc: int = 1,
91
93
  ) -> TaskResult | TaskError:
92
94
  """The core logic to run a model on a given task. See `evaluate` for more details.
93
95
 
@@ -270,7 +272,7 @@ def evaluate(
270
272
  *,
271
273
  co2_tracker: bool | None = None,
272
274
  raise_error: bool = True,
273
- encode_kwargs: dict[str, Any] | None = None,
275
+ encode_kwargs: EncodeKwargs | None = None,
274
276
  cache: ResultCache | None = ResultCache(),
275
277
  overwrite_strategy: str | OverwriteStrategy = "only-missing",
276
278
  prediction_folder: Path | str | None = None,
@@ -5,6 +5,7 @@ from collections.abc import Callable, Sequence
5
5
  from typing import Any, Literal, cast, get_args, overload
6
6
 
7
7
  from torch.utils.data import DataLoader
8
+ from typing_extensions import Unpack
8
9
 
9
10
  import mteb
10
11
  from mteb.abstasks.task_metadata import TaskMetadata, TaskType
@@ -19,6 +20,7 @@ from mteb.similarity_functions import (
19
20
  from mteb.types import (
20
21
  Array,
21
22
  BatchedInput,
23
+ EncodeKwargs,
22
24
  PromptType,
23
25
  )
24
26
 
@@ -370,7 +372,7 @@ class AbsEncoder(ABC):
370
372
  hf_split: str,
371
373
  hf_subset: str,
372
374
  prompt_type: PromptType | None = None,
373
- **kwargs: Any,
375
+ **kwargs: Unpack[EncodeKwargs],
374
376
  ) -> Array:
375
377
  """Encodes the given sentences using the encoder.
376
378
 
@@ -92,7 +92,7 @@ def instruct_wrapper(
92
92
  logger.info(
93
93
  f"Using instruction: '{instruction}' for task: '{task_metadata.name}'"
94
94
  )
95
- embeddings = super().encode( # type: ignore[safe-super]
95
+ embeddings = super().encode( # type: ignore[safe-super,call-arg]
96
96
  _inputs, # type: ignore[arg-type]
97
97
  instruction=instruction,
98
98
  *args,
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Any
3
2
 
4
3
  from mteb._create_dataloaders import _create_text_queries_dataloader
5
4
  from mteb._requires_package import requires_package
@@ -8,6 +7,7 @@ from mteb.models.model_meta import ModelMeta
8
7
  from mteb.models.models_protocols import SearchProtocol
9
8
  from mteb.types import (
10
9
  CorpusDatasetType,
10
+ EncodeKwargs,
11
11
  InstructionDatasetType,
12
12
  QueryDatasetType,
13
13
  RetrievalOutputType,
@@ -49,7 +49,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
49
49
  task_metadata: TaskMetadata,
50
50
  hf_split: str,
51
51
  hf_subset: str,
52
- encode_kwargs: dict[str, Any],
52
+ encode_kwargs: EncodeKwargs,
53
53
  ) -> None:
54
54
  logger.info("Encoding Corpus...")
55
55
  corpus_texts = [
@@ -74,7 +74,7 @@ def bm25_loader(model_name, **kwargs) -> SearchProtocol:
74
74
  hf_split: str,
75
75
  hf_subset: str,
76
76
  top_k: int,
77
- encode_kwargs: dict[str, Any],
77
+ encode_kwargs: EncodeKwargs,
78
78
  instructions: InstructionDatasetType | None = None,
79
79
  top_ranked: TopRankedDocumentsType | None = None,
80
80
  ) -> RetrievalOutputType:
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
7
7
  from mteb._requires_package import requires_image_dependencies
8
8
  from mteb.abstasks.task_metadata import TaskMetadata
9
9
  from mteb.models.abs_encoder import AbsEncoder
10
+ from mteb.models.model_implementations.colpali_models import COLPALI_TRAINING_DATA
10
11
  from mteb.models.model_meta import ModelMeta, ScoringFunction
11
12
  from mteb.types import Array, BatchedInput, PromptType
12
13
 
@@ -120,6 +121,15 @@ class JinaCLIPModel(AbsEncoder):
120
121
  raise ValueError
121
122
 
122
123
 
124
+ _JINA_CLIP_TRAIN_DATASETS_V1 = {
125
+ # LAION400M
126
+ # ShareGPT4V
127
+ "MSMARCO",
128
+ "NQ",
129
+ "HotpotQA",
130
+ # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
131
+ }
132
+
123
133
  jina_clip_v1 = ModelMeta(
124
134
  loader=JinaCLIPModel,
125
135
  name="jinaai/jina-clip-v1",
@@ -140,13 +150,41 @@ jina_clip_v1 = ModelMeta(
140
150
  reference="https://huggingface.co/jinaai/jina-clip-v1",
141
151
  similarity_fn_name=ScoringFunction.COSINE,
142
152
  use_instructions=True,
143
- training_datasets={
144
- # LAION400M
145
- # ShareGPT4V
146
- "MSMARCO",
147
- # NQ
148
- # HotpotQA
149
- # Natural Language Inference (NLI) dataset (Bowman et al., 2015)
150
- },
153
+ training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1,
151
154
  citation=JINA_CLIP_CITATION,
155
+ superseded_by="jinaai/jina-clip-v2",
156
+ )
157
+
158
+ jina_clip_v2 = ModelMeta(
159
+ loader=JinaCLIPModel,
160
+ name="jinaai/jina-clip-v2",
161
+ revision="344d954da76eb8ad47a7aaff42d012e30c15b8fe",
162
+ release_date="2024-10-09",
163
+ languages=["eng-Latn"],
164
+ n_parameters=865278477,
165
+ memory_usage_mb=1650.0,
166
+ max_tokens=8192,
167
+ embed_dim=1024,
168
+ license="cc-by-nc-4.0",
169
+ open_weights=True,
170
+ public_training_code=None,
171
+ public_training_data=None,
172
+ framework=["PyTorch", "Sentence Transformers"],
173
+ reference="https://huggingface.co/jinaai/jina-clip-v2",
174
+ similarity_fn_name=ScoringFunction.COSINE,
175
+ use_instructions=False,
176
+ training_datasets=_JINA_CLIP_TRAIN_DATASETS_V1 | COLPALI_TRAINING_DATA,
177
+ modalities=["text", "image"],
178
+ model_type=["dense"],
179
+ citation="""
180
+ @misc{koukounas2024jinaclipv2multilingualmultimodalembeddings,
181
+ title={jina-clip-v2: Multilingual Multimodal Embeddings for Text and Images},
182
+ author={Andreas Koukounas and Georgios Mastrapas and Bo Wang and Mohammad Kalim Akram and Sedigheh Eslami and Michael Günther and Isabelle Mohr and Saba Sturua and Scott Martens and Nan Wang and Han Xiao},
183
+ year={2024},
184
+ eprint={2412.08802},
185
+ archivePrefix={arXiv},
186
+ primaryClass={cs.CL},
187
+ url={https://arxiv.org/abs/2412.08802},
188
+ }
189
+ """,
152
190
  )
@@ -2,7 +2,10 @@ from mteb.models.model_meta import (
2
2
  ModelMeta,
3
3
  ScoringFunction,
4
4
  )
5
- from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
5
+ from mteb.models.sentence_transformer_wrapper import (
6
+ CrossEncoderWrapper,
7
+ sentence_transformers_loader,
8
+ )
6
9
 
7
10
  mixedbread_training_data = {
8
11
  # from correspondence:
@@ -122,3 +125,117 @@ mxbai_embed_xsmall_v1 = ModelMeta(
122
125
  url={https://www.mixedbread.ai/blog/mxbai-embed-xsmall-v1},
123
126
  }""",
124
127
  )
128
+
129
+ mxbai_rerank_xsmall_v1 = ModelMeta(
130
+ loader=CrossEncoderWrapper,
131
+ name="mixedbread-ai/mxbai-rerank-xsmall-v1",
132
+ revision="b5c6e9da73abc3711f593f705371cdbe9e0fe422",
133
+ release_date="2024-02-29",
134
+ languages=["eng-Latn"],
135
+ n_parameters=70830337,
136
+ memory_usage_mb=135.0,
137
+ max_tokens=512,
138
+ embed_dim=None,
139
+ license="apache-2.0",
140
+ open_weights=True,
141
+ public_training_code=None,
142
+ public_training_data=None,
143
+ framework=[
144
+ "PyTorch",
145
+ "Sentence Transformers",
146
+ "Transformers",
147
+ "ONNX",
148
+ "safetensors",
149
+ ],
150
+ reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-xsmall-v1",
151
+ similarity_fn_name=None,
152
+ use_instructions=None,
153
+ training_datasets=None,
154
+ adapted_from=None,
155
+ superseded_by=None,
156
+ modalities=["text"],
157
+ model_type=["cross-encoder"],
158
+ citation="""@online{rerank2024mxbai,
159
+ title={Boost Your Search With The Crispy Mixedbread Rerank Models},
160
+ author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
161
+ year={2024},
162
+ url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
163
+ }""",
164
+ contacts=None,
165
+ )
166
+
167
+ mxbai_rerank_base_v1 = ModelMeta(
168
+ loader=CrossEncoderWrapper,
169
+ name="mixedbread-ai/mxbai-rerank-base-v1",
170
+ revision="800f24c113213a187e65bde9db00c15a2bb12738",
171
+ release_date="2024-02-29",
172
+ languages=["eng-Latn"],
173
+ n_parameters=184422913,
174
+ memory_usage_mb=352.0,
175
+ max_tokens=512,
176
+ embed_dim=None,
177
+ license="apache-2.0",
178
+ open_weights=True,
179
+ public_training_code=None,
180
+ public_training_data=None,
181
+ framework=[
182
+ "PyTorch",
183
+ "Sentence Transformers",
184
+ "Transformers",
185
+ "ONNX",
186
+ "safetensors",
187
+ ],
188
+ reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1",
189
+ similarity_fn_name=None,
190
+ use_instructions=None,
191
+ training_datasets=None,
192
+ adapted_from=None,
193
+ superseded_by=None,
194
+ modalities=["text"],
195
+ model_type=["cross-encoder"],
196
+ citation="""@online{rerank2024mxbai,
197
+ title={Boost Your Search With The Crispy Mixedbread Rerank Models},
198
+ author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
199
+ year={2024},
200
+ url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
201
+ }""",
202
+ contacts=None,
203
+ )
204
+
205
+ mxbai_rerank_large_v1 = ModelMeta(
206
+ loader=CrossEncoderWrapper,
207
+ name="mixedbread-ai/mxbai-rerank-large-v1",
208
+ revision="98f655841d5caf0b16eaff79c2b4ca109d920d17",
209
+ release_date="2024-02-29",
210
+ languages=["eng-Latn"],
211
+ n_parameters=435062785,
212
+ memory_usage_mb=830.0,
213
+ max_tokens=512,
214
+ embed_dim=None,
215
+ license="apache-2.0",
216
+ open_weights=True,
217
+ public_training_code=None,
218
+ public_training_data=None,
219
+ framework=[
220
+ "PyTorch",
221
+ "Sentence Transformers",
222
+ "Transformers",
223
+ "ONNX",
224
+ "safetensors",
225
+ ],
226
+ reference="https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1",
227
+ similarity_fn_name=None,
228
+ use_instructions=None,
229
+ training_datasets=None,
230
+ adapted_from=None,
231
+ superseded_by=None,
232
+ modalities=["text"],
233
+ model_type=["cross-encoder"],
234
+ citation="""@online{rerank2024mxbai,
235
+ title={Boost Your Search With The Crispy Mixedbread Rerank Models},
236
+ author={Aamir Shakir and Darius Koenig and Julius Lipp and Sean Lee},
237
+ year={2024},
238
+ url={https://www.mixedbread.ai/blog/mxbai-rerank-v1},
239
+ }""",
240
+ contacts=None,
241
+ )
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from collections.abc import Callable
2
3
  from typing import Any
3
4
 
4
5
  import torch
@@ -29,7 +30,7 @@ NV_RETRIEVER_CITATION = """@misc{moreira2025nvretrieverimprovingtextembedding,
29
30
  }"""
30
31
 
31
32
 
32
- def instruction_template(
33
+ def _instruction_template(
33
34
  instruction: str, prompt_type: PromptType | None = None
34
35
  ) -> str:
35
36
  return f"Instruct: {instruction}\nQuery: " if instruction else ""
@@ -100,10 +101,77 @@ nvidia_training_datasets = {
100
101
  "MrTidyRetrieval",
101
102
  }
102
103
 
104
+
105
+ class _NVEmbedWrapper(InstructSentenceTransformerModel):
106
+ """Inherited, because nvembed requires `sbert==2`, but it doesn't have tokenizers kwargs"""
107
+
108
+ def __init__(
109
+ self,
110
+ model_name: str,
111
+ revision: str,
112
+ instruction_template: str
113
+ | Callable[[str, PromptType | None], str]
114
+ | None = None,
115
+ max_seq_length: int | None = None,
116
+ apply_instruction_to_passages: bool = True,
117
+ padding_side: str | None = None,
118
+ add_eos_token: bool = False,
119
+ prompts_dict: dict[str, str] | None = None,
120
+ **kwargs: Any,
121
+ ):
122
+ from sentence_transformers import __version__ as sbert_version
123
+
124
+ required_transformers_version = "4.42.4"
125
+ required_sbert_version = "2.7.0"
126
+
127
+ if Version(transformers_version) != Version(required_transformers_version):
128
+ raise RuntimeError(
129
+ f"transformers version {transformers_version} is not match with required "
130
+ f"install version {required_transformers_version} to run `nvidia/NV-Embed-v2`"
131
+ )
132
+
133
+ if Version(sbert_version) != Version(required_sbert_version):
134
+ raise RuntimeError(
135
+ f"sbert version {sbert_version} is not match with required "
136
+ f"install version {required_sbert_version} to run `nvidia/NV-Embed-v2`"
137
+ )
138
+
139
+ requires_package(
140
+ self, "flash_attn", model_name, "pip install 'mteb[flash_attention]'"
141
+ )
142
+
143
+ from sentence_transformers import SentenceTransformer
144
+
145
+ if (
146
+ isinstance(instruction_template, str)
147
+ and "{instruction}" not in instruction_template
148
+ ):
149
+ raise ValueError(
150
+ "Instruction template must contain the string '{instruction}'."
151
+ )
152
+ if instruction_template is None:
153
+ logger.warning(
154
+ "No instruction template provided. Instructions will be used as-is."
155
+ )
156
+
157
+ self.instruction_template = instruction_template
158
+
159
+ self.model_name = model_name
160
+ self.model = SentenceTransformer(model_name, revision=revision, **kwargs)
161
+ self.model.tokenizer.padding_side = padding_side
162
+ self.model.tokenizer.add_eos_token = add_eos_token
163
+
164
+ if max_seq_length:
165
+ # https://github.com/huggingface/sentence-transformers/issues/3575
166
+ self.model.max_seq_length = max_seq_length
167
+ self.apply_instruction_to_passages = apply_instruction_to_passages
168
+ self.prompts_dict = prompts_dict
169
+
170
+
103
171
  NV_embed_v2 = ModelMeta(
104
- loader=InstructSentenceTransformerModel,
172
+ loader=_NVEmbedWrapper,
105
173
  loader_kwargs=dict(
106
- instruction_template=instruction_template,
174
+ instruction_template=_instruction_template,
107
175
  trust_remote_code=True,
108
176
  max_seq_length=32768,
109
177
  padding_side="right",
@@ -132,9 +200,9 @@ NV_embed_v2 = ModelMeta(
132
200
  )
133
201
 
134
202
  NV_embed_v1 = ModelMeta(
135
- loader=InstructSentenceTransformerModel,
203
+ loader=_NVEmbedWrapper,
136
204
  loader_kwargs=dict(
137
- instruction_template=instruction_template,
205
+ instruction_template=_instruction_template,
138
206
  trust_remote_code=True,
139
207
  max_seq_length=32768,
140
208
  padding_side="right",
@@ -163,6 +163,36 @@ _PREDEFINED_PROMPTS = {
163
163
  "German1Retrieval": "Given a query, retrieve relevant passages",
164
164
  }
165
165
 
166
+ Octen_Embedding_0B6 = ModelMeta(
167
+ loader=InstructSentenceTransformerModel,
168
+ loader_kwargs=dict(
169
+ instruction_template=instruction_template,
170
+ apply_instruction_to_passages=True,
171
+ prompts_dict=_PREDEFINED_PROMPTS,
172
+ max_seq_length=18480,
173
+ model_kwargs={"torch_dtype": "bfloat16"},
174
+ ),
175
+ name="bflhc/Octen-Embedding-0.6B",
176
+ languages=multilingual_langs,
177
+ open_weights=True,
178
+ revision="1a00a4e837bd788f6f8d91bc43201a5e52cf8ef8",
179
+ release_date="2026-01-10",
180
+ n_parameters=595776512,
181
+ memory_usage_mb=1136,
182
+ embed_dim=1024,
183
+ max_tokens=32768,
184
+ license="apache-2.0",
185
+ reference="https://huggingface.co/bflhc/Octen-Embedding-0.6B",
186
+ similarity_fn_name="cosine",
187
+ framework=["Sentence Transformers", "PyTorch", "safetensors"],
188
+ use_instructions=True,
189
+ public_training_code=None,
190
+ public_training_data=None,
191
+ training_datasets=training_data,
192
+ citation=OCTEN_CITATION,
193
+ adapted_from="Qwen/Qwen3-Embedding-0.6B",
194
+ )
195
+
166
196
  Octen_Embedding_4B = ModelMeta(
167
197
  loader=InstructSentenceTransformerModel,
168
198
  loader_kwargs=dict(