mteb 2.6.6__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55):
  1. mteb/_create_dataloaders.py +7 -3
  2. mteb/_evaluators/any_sts_evaluator.py +6 -3
  3. mteb/_evaluators/clustering_evaluator.py +2 -2
  4. mteb/_evaluators/evaluator.py +2 -1
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +8 -5
  6. mteb/_evaluators/pair_classification_evaluator.py +2 -2
  7. mteb/_evaluators/retrieval_evaluator.py +2 -2
  8. mteb/_evaluators/sklearn_evaluator.py +3 -3
  9. mteb/_evaluators/text/bitext_mining_evaluator.py +5 -3
  10. mteb/_evaluators/text/summarization_evaluator.py +3 -2
  11. mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
  12. mteb/abstasks/abstask.py +3 -2
  13. mteb/abstasks/aggregated_task.py +3 -3
  14. mteb/abstasks/classification.py +3 -3
  15. mteb/abstasks/clustering.py +2 -2
  16. mteb/abstasks/clustering_legacy.py +2 -2
  17. mteb/abstasks/image/image_text_pair_classification.py +2 -1
  18. mteb/abstasks/multilabel_classification.py +2 -2
  19. mteb/abstasks/pair_classification.py +2 -2
  20. mteb/abstasks/retrieval.py +15 -14
  21. mteb/abstasks/sts.py +2 -2
  22. mteb/abstasks/text/bitext_mining.py +3 -3
  23. mteb/abstasks/text/summarization.py +2 -2
  24. mteb/abstasks/zeroshot_classification.py +3 -2
  25. mteb/benchmarks/benchmarks/__init__.py +2 -0
  26. mteb/benchmarks/benchmarks/benchmarks.py +24 -0
  27. mteb/cli/build_cli.py +2 -1
  28. mteb/deprecated_evaluator.py +3 -3
  29. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
  30. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
  31. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
  32. mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
  33. mteb/evaluate.py +5 -3
  34. mteb/models/abs_encoder.py +3 -1
  35. mteb/models/instruct_wrapper.py +1 -1
  36. mteb/models/model_implementations/bm25.py +3 -3
  37. mteb/models/model_implementations/jina_clip.py +46 -8
  38. mteb/models/model_implementations/mxbai_models.py +118 -1
  39. mteb/models/model_implementations/nvidia_models.py +73 -5
  40. mteb/models/model_implementations/octen_models.py +30 -0
  41. mteb/models/model_implementations/pylate_models.py +5 -4
  42. mteb/models/model_implementations/sentence_transformers_models.py +66 -0
  43. mteb/models/models_protocols.py +6 -4
  44. mteb/models/search_wrappers.py +7 -6
  45. mteb/models/sentence_transformer_wrapper.py +5 -4
  46. mteb/tasks/retrieval/kor/__init__.py +15 -1
  47. mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
  48. mteb/types/__init__.py +2 -0
  49. mteb/types/_encoder_io.py +12 -0
  50. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/METADATA +1 -1
  51. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/RECORD +55 -50
  52. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/WHEEL +0 -0
  53. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/entry_points.txt +0 -0
  54. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/licenses/LICENSE +0 -0
  55. {mteb-2.6.6.dist-info → mteb-2.6.8.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ from mteb.types import (
19
19
  Array,
20
20
  BatchedInput,
21
21
  CorpusDatasetType,
22
+ EncodeKwargs,
22
23
  PromptType,
23
24
  QueryDatasetType,
24
25
  RetrievalOutputType,
@@ -45,7 +46,7 @@ class PylateSearchEncoder:
45
46
  task_metadata: TaskMetadata,
46
47
  hf_split: str,
47
48
  hf_subset: str,
48
- encode_kwargs: dict[str, Any],
49
+ encode_kwargs: EncodeKwargs,
49
50
  ) -> None:
50
51
  """Index the corpus for retrieval.
51
52
 
@@ -78,7 +79,7 @@ class PylateSearchEncoder:
78
79
  hf_split: str,
79
80
  hf_subset: str,
80
81
  top_k: int,
81
- encode_kwargs: dict[str, Any],
82
+ encode_kwargs: EncodeKwargs,
82
83
  top_ranked: TopRankedDocumentsType | None = None,
83
84
  ) -> RetrievalOutputType:
84
85
  queries_dataloader = create_dataloader(
@@ -136,7 +137,7 @@ class PylateSearchEncoder:
136
137
  hf_subset: str,
137
138
  hf_split: str,
138
139
  top_k: int,
139
- encode_kwargs: dict[str, Any],
140
+ encode_kwargs: EncodeKwargs,
140
141
  ) -> dict[str, list[tuple[float, str]]]:
141
142
  from pylate import indexes, retrieve
142
143
 
@@ -200,7 +201,7 @@ class PylateSearchEncoder:
200
201
  task_metadata: TaskMetadata,
201
202
  hf_subset: str,
202
203
  hf_split: str,
203
- encode_kwargs: dict[str, Any],
204
+ encode_kwargs: EncodeKwargs,
204
205
  ) -> dict[str, list[tuple[float, str]]]:
205
206
  """Rerank with PyLate's rank.rerank using per-query candidates.
206
207
 
@@ -1,5 +1,7 @@
1
1
  """Implementation of Sentence Transformers model validated in MTEB."""
2
2
 
3
+ import numpy as np
4
+
3
5
  from mteb.models.model_meta import ModelMeta, ScoringFunction
4
6
  from mteb.models.sentence_transformer_wrapper import (
5
7
  SentenceTransformerEncoderWrapper,
@@ -773,3 +775,67 @@ gtr_t5_base = ModelMeta(
773
775
  },
774
776
  citation=GTR_CITATION,
775
777
  )
778
+
779
+ static_retrieval_mrl_en_v1 = ModelMeta(
780
+ loader=sentence_transformers_loader,
781
+ name="sentence-transformers/static-retrieval-mrl-en-v1",
782
+ revision="f60985c706f192d45d218078e49e5a8b6f15283a",
783
+ release_date="2024-10-24",
784
+ languages=["eng-Latn"],
785
+ n_parameters=3_125_4528,
786
+ memory_usage_mb=119,
787
+ max_tokens=np.inf,
788
+ embed_dim=1024,
789
+ license="apache-2.0",
790
+ open_weights=True,
791
+ public_training_code="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1/blob/main/train.py",
792
+ public_training_data=None,
793
+ framework=["PyTorch", "Sentence Transformers"],
794
+ reference="https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1",
795
+ similarity_fn_name=ScoringFunction.COSINE,
796
+ use_instructions=False,
797
+ training_datasets={
798
+ "MSMARCO",
799
+ # gooaq
800
+ # s2orc
801
+ # allnli
802
+ # paq
803
+ # trivia-qa
804
+ # swim-ir-monolingual
805
+ # PubMedQA
806
+ # swim
807
+ "MIRACLRetrieval",
808
+ "MultiLongDocRetrieval",
809
+ "MrTidyRetrieval",
810
+ },
811
+ modalities=["text"],
812
+ model_type=["dense"],
813
+ )
814
+
815
+ multi_qa_mpnet_base_dot_v1 = ModelMeta(
816
+ loader=sentence_transformers_loader,
817
+ name="sentence-transformers/multi-qa-mpnet-base-dot-v1",
818
+ revision="3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f",
819
+ release_date="2021-08-23",
820
+ languages=["eng-Latn"],
821
+ n_parameters=109486978,
822
+ memory_usage_mb=418.0,
823
+ max_tokens=512,
824
+ embed_dim=768,
825
+ license=None,
826
+ open_weights=True,
827
+ public_training_code="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/train_script.py",
828
+ public_training_data=None,
829
+ framework=["PyTorch", "Sentence Transformers"],
830
+ reference="https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1",
831
+ similarity_fn_name=ScoringFunction.DOT_PRODUCT,
832
+ use_instructions=False,
833
+ training_datasets={
834
+ "MSMARCO",
835
+ "YahooAnswersTopicsClassification",
836
+ "NQ",
837
+ },
838
+ adapted_from="microsoft/mpnet-base",
839
+ modalities=["text"],
840
+ model_type=["dense"],
841
+ )
@@ -1,12 +1,14 @@
1
1
  from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
2
2
 
3
3
  from torch.utils.data import DataLoader
4
+ from typing_extensions import Unpack
4
5
 
5
6
  from mteb.abstasks.task_metadata import TaskMetadata
6
7
  from mteb.types import (
7
8
  Array,
8
9
  BatchedInput,
9
10
  CorpusDatasetType,
11
+ EncodeKwargs,
10
12
  PromptType,
11
13
  QueryDatasetType,
12
14
  RetrievalOutputType,
@@ -28,7 +30,7 @@ class SearchProtocol(Protocol):
28
30
  task_metadata: TaskMetadata,
29
31
  hf_split: str,
30
32
  hf_subset: str,
31
- encode_kwargs: dict[str, Any],
33
+ encode_kwargs: EncodeKwargs,
32
34
  ) -> None:
33
35
  """Index the corpus for retrieval.
34
36
 
@@ -49,7 +51,7 @@ class SearchProtocol(Protocol):
49
51
  hf_split: str,
50
52
  hf_subset: str,
51
53
  top_k: int,
52
- encode_kwargs: dict[str, Any],
54
+ encode_kwargs: EncodeKwargs,
53
55
  top_ranked: TopRankedDocumentsType | None = None,
54
56
  ) -> RetrievalOutputType:
55
57
  """Search the corpus using the given queries.
@@ -108,7 +110,7 @@ class EncoderProtocol(Protocol):
108
110
  hf_split: str,
109
111
  hf_subset: str,
110
112
  prompt_type: PromptType | None = None,
111
- **kwargs: Any,
113
+ **kwargs: Unpack[EncodeKwargs],
112
114
  ) -> Array:
113
115
  """Encodes the given sentences using the encoder.
114
116
 
@@ -214,7 +216,7 @@ class CrossEncoderProtocol(Protocol):
214
216
  hf_split: str,
215
217
  hf_subset: str,
216
218
  prompt_type: PromptType | None = None,
217
- **kwargs: Any,
219
+ **kwargs: Unpack[EncodeKwargs],
218
220
  ) -> Array:
219
221
  """Predicts relevance scores for pairs of inputs. Note that, unlike the encoder, the cross-encoder can compare across inputs.
220
222
 
@@ -14,6 +14,7 @@ from mteb.types import (
14
14
  Array,
15
15
  BatchedInput,
16
16
  CorpusDatasetType,
17
+ EncodeKwargs,
17
18
  PromptType,
18
19
  QueryDatasetType,
19
20
  RetrievalOutputType,
@@ -50,7 +51,7 @@ class SearchEncoderWrapper:
50
51
  task_metadata: TaskMetadata,
51
52
  hf_split: str,
52
53
  hf_subset: str,
53
- encode_kwargs: dict[str, Any],
54
+ encode_kwargs: EncodeKwargs,
54
55
  ) -> None:
55
56
  """Index the corpus for retrieval.
56
57
 
@@ -88,7 +89,7 @@ class SearchEncoderWrapper:
88
89
  hf_split: str,
89
90
  hf_subset: str,
90
91
  top_k: int,
91
- encode_kwargs: dict[str, Any],
92
+ encode_kwargs: EncodeKwargs,
92
93
  top_ranked: TopRankedDocumentsType | None = None,
93
94
  ) -> RetrievalOutputType:
94
95
  """Search the corpus for the given queries.
@@ -215,7 +216,7 @@ class SearchEncoderWrapper:
215
216
  hf_subset: str,
216
217
  hf_split: str,
217
218
  top_k: int,
218
- encode_kwargs: dict[str, Any],
219
+ encode_kwargs: EncodeKwargs,
219
220
  ) -> dict[str, list[tuple[float, str]]]:
220
221
  logger.info("Encoding Corpus in batches (this might take a while)...")
221
222
  if self.task_corpus is None:
@@ -318,7 +319,7 @@ class SearchEncoderWrapper:
318
319
  task_metadata: TaskMetadata,
319
320
  hf_subset: str,
320
321
  hf_split: str,
321
- encode_kwargs: dict[str, Any],
322
+ encode_kwargs: EncodeKwargs,
322
323
  ) -> dict[str, list[tuple[float, str]]]:
323
324
  """Rerank documents based on pre-ranked documents.
324
325
 
@@ -470,7 +471,7 @@ class SearchCrossEncoderWrapper:
470
471
  task_metadata: TaskMetadata,
471
472
  hf_split: str,
472
473
  hf_subset: str,
473
- encode_kwargs: dict[str, Any],
474
+ encode_kwargs: EncodeKwargs,
474
475
  ) -> None:
475
476
  """Index the corpus for retrieval.
476
477
 
@@ -491,7 +492,7 @@ class SearchCrossEncoderWrapper:
491
492
  hf_split: str,
492
493
  hf_subset: str,
493
494
  top_k: int,
494
- encode_kwargs: dict[str, Any],
495
+ encode_kwargs: EncodeKwargs,
495
496
  top_ranked: TopRankedDocumentsType | None = None,
496
497
  ) -> RetrievalOutputType:
497
498
  """Search the corpus using the given queries.
@@ -8,10 +8,11 @@ import numpy as np
8
8
  import torch
9
9
  from packaging.version import Version
10
10
  from torch.utils.data import DataLoader
11
+ from typing_extensions import Unpack
11
12
 
12
13
  from mteb._log_once import LogOnce
13
14
  from mteb.models import ModelMeta
14
- from mteb.types import Array, BatchedInput, PromptType
15
+ from mteb.types import Array, BatchedInput, EncodeKwargs, PromptType
15
16
 
16
17
  from .abs_encoder import AbsEncoder
17
18
 
@@ -122,7 +123,7 @@ class SentenceTransformerEncoderWrapper(AbsEncoder):
122
123
  hf_split: str,
123
124
  hf_subset: str,
124
125
  prompt_type: PromptType | None = None,
125
- **kwargs: Any,
126
+ **kwargs: Unpack[EncodeKwargs],
126
127
  ) -> Array:
127
128
  """Encodes the given sentences using the encoder.
128
129
 
@@ -201,7 +202,7 @@ class SentenceTransformerMultimodalEncoderWrapper(SentenceTransformerEncoderWrap
201
202
  hf_split: str,
202
203
  hf_subset: str,
203
204
  prompt_type: PromptType | None = None,
204
- **kwargs: Any,
205
+ **kwargs: Unpack[EncodeKwargs],
205
206
  ) -> Array:
206
207
  """Encodes the given sentences using the encoder.
207
208
 
@@ -292,7 +293,7 @@ class CrossEncoderWrapper:
292
293
  hf_split: str,
293
294
  hf_subset: str,
294
295
  prompt_type: PromptType | None = None,
295
- **kwargs: Any,
296
+ **kwargs: Unpack[EncodeKwargs],
296
297
  ) -> Array:
297
298
  """Predicts relevance scores for pairs of inputs. Note that, unlike the encoder, the cross-encoder can compare across inputs.
298
299
 
@@ -1,5 +1,19 @@
1
1
  from .auto_rag_retrieval import AutoRAGRetrieval
2
2
  from .ko_strategy_qa import KoStrategyQA
3
+ from .kovidore2_bench_retrieval import (
4
+ KoVidore2CybersecurityRetrieval,
5
+ KoVidore2EconomicRetrieval,
6
+ KoVidore2EnergyRetrieval,
7
+ KoVidore2HrRetrieval,
8
+ )
3
9
  from .squad_kor_v1_retrieval import SQuADKorV1Retrieval
4
10
 
5
- __all__ = ["AutoRAGRetrieval", "KoStrategyQA", "SQuADKorV1Retrieval"]
11
+ __all__ = [
12
+ "AutoRAGRetrieval",
13
+ "KoStrategyQA",
14
+ "KoVidore2CybersecurityRetrieval",
15
+ "KoVidore2EconomicRetrieval",
16
+ "KoVidore2EnergyRetrieval",
17
+ "KoVidore2HrRetrieval",
18
+ "SQuADKorV1Retrieval",
19
+ ]
@@ -0,0 +1,142 @@
1
+ from mteb.abstasks.retrieval import AbsTaskRetrieval
2
+ from mteb.abstasks.task_metadata import TaskMetadata
3
+
4
+
5
+ class KoVidore2CybersecurityRetrieval(AbsTaskRetrieval):
6
+ metadata = TaskMetadata(
7
+ name="KoVidore2CybersecurityRetrieval",
8
+ description="Retrieve associated pages according to questions. This dataset, Cybersecurity, is a corpus of technical reports on cyber threat trends and security incident responses in Korea, intended for complex-document understanding tasks.",
9
+ reference="https://github.com/whybe-choi/kovidore-data-generator",
10
+ dataset={
11
+ "path": "whybe-choi/kovidore-v2-cybersecurity-mteb",
12
+ "revision": "577d7c45f79d8eb4e7584db3990f91daa7e47956",
13
+ },
14
+ type="DocumentUnderstanding",
15
+ category="t2i",
16
+ eval_splits=["test"],
17
+ eval_langs=["kor-Hang"],
18
+ main_score="ndcg_at_10",
19
+ date=("2025-12-21", "2026-01-06"),
20
+ domains=["Social"],
21
+ task_subtypes=["Image Text Retrieval"],
22
+ license="cc-by-4.0",
23
+ annotations_creators="derived",
24
+ dialect=[],
25
+ modalities=["text", "image"],
26
+ sample_creation="created",
27
+ bibtex_citation="""
28
+ @misc{choi2026kovidorev2,
29
+ author = {Yongbin Choi},
30
+ note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
31
+ title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
32
+ url = {https://github.com/whybe-choi/kovidore-data-generator},
33
+ year = {2026},
34
+ }
35
+ """,
36
+ prompt={"query": "Find a screenshot that is relevant to the user's question."},
37
+ )
38
+
39
+
40
+ class KoVidore2EconomicRetrieval(AbsTaskRetrieval):
41
+ metadata = TaskMetadata(
42
+ name="KoVidore2EconomicRetrieval",
43
+ description="Retrieve associated pages according to questions. This dataset, Economic trends, is a corpus of periodic reports on major economic indicators in Korea, intended for complex-document understanding tasks.",
44
+ reference="https://github.com/whybe-choi/kovidore-data-generator",
45
+ dataset={
46
+ "path": "whybe-choi/kovidore-v2-economic-mteb",
47
+ "revision": "0189c26211290a902cd9d41a0db932808a54c0a8",
48
+ },
49
+ type="DocumentUnderstanding",
50
+ category="t2i",
51
+ eval_splits=["test"],
52
+ eval_langs=["kor-Hang"],
53
+ main_score="ndcg_at_10",
54
+ date=("2025-12-21", "2026-01-06"),
55
+ domains=["Social"],
56
+ task_subtypes=["Image Text Retrieval"],
57
+ license="cc-by-4.0",
58
+ annotations_creators="derived",
59
+ dialect=[],
60
+ modalities=["text", "image"],
61
+ sample_creation="created",
62
+ bibtex_citation="""
63
+ @misc{choi2026kovidorev2,
64
+ author = {Yongbin Choi},
65
+ note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
66
+ title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
67
+ url = {https://github.com/whybe-choi/kovidore-data-generator},
68
+ year = {2026},
69
+ }
70
+ """,
71
+ prompt={"query": "Find a screenshot that is relevant to the user's question."},
72
+ )
73
+
74
+
75
+ class KoVidore2EnergyRetrieval(AbsTaskRetrieval):
76
+ metadata = TaskMetadata(
77
+ name="KoVidore2EnergyRetrieval",
78
+ description="Retrieve associated pages according to questions. This dataset, Energy, is a corpus of reports on energy market trends, policy planning, and industry statistics, intended for complex-document understanding tasks.",
79
+ reference="https://github.com/whybe-choi/kovidore-data-generator",
80
+ dataset={
81
+ "path": "whybe-choi/kovidore-v2-energy-mteb",
82
+ "revision": "8c09a3d22b1fa3a7f5e815e9521da9b048754211",
83
+ },
84
+ type="DocumentUnderstanding",
85
+ category="t2i",
86
+ eval_splits=["test"],
87
+ eval_langs=["kor-Hang"],
88
+ main_score="ndcg_at_10",
89
+ date=("2025-12-21", "2026-01-06"),
90
+ domains=["Social"],
91
+ task_subtypes=["Image Text Retrieval"],
92
+ license="cc-by-4.0",
93
+ annotations_creators="derived",
94
+ dialect=[],
95
+ modalities=["text", "image"],
96
+ sample_creation="created",
97
+ bibtex_citation="""
98
+ @misc{choi2026kovidorev2,
99
+ author = {Yongbin Choi},
100
+ note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
101
+ title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
102
+ url = {https://github.com/whybe-choi/kovidore-data-generator},
103
+ year = {2026},
104
+ }
105
+ """,
106
+ prompt={"query": "Find a screenshot that is relevant to the user's question."},
107
+ )
108
+
109
+
110
+ class KoVidore2HrRetrieval(AbsTaskRetrieval):
111
+ metadata = TaskMetadata(
112
+ name="KoVidore2HrRetrieval",
113
+ description="Retrieve associated pages according to questions. This dataset, HR, is a corpus of reports on workforce outlook and employment policy in Korea, intended for complex-document understanding tasks.",
114
+ reference="https://github.com/whybe-choi/kovidore-data-generator",
115
+ dataset={
116
+ "path": "whybe-choi/kovidore-v2-hr-mteb",
117
+ "revision": "d9432c782a9a3e2eed064f6fac08b4c967d92b99",
118
+ },
119
+ type="DocumentUnderstanding",
120
+ category="t2i",
121
+ eval_splits=["test"],
122
+ eval_langs=["kor-Hang"],
123
+ main_score="ndcg_at_10",
124
+ date=("2025-12-21", "2026-01-06"),
125
+ domains=["Social"],
126
+ task_subtypes=["Image Text Retrieval"],
127
+ license="cc-by-4.0",
128
+ annotations_creators="derived",
129
+ dialect=[],
130
+ modalities=["text", "image"],
131
+ sample_creation="created",
132
+ bibtex_citation="""
133
+ @misc{choi2026kovidorev2,
134
+ author = {Yongbin Choi},
135
+ note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains},
136
+ title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases},
137
+ url = {https://github.com/whybe-choi/kovidore-data-generator},
138
+ year = {2026},
139
+ }
140
+ """,
141
+ prompt={"query": "Find a screenshot that is relevant to the user's question."},
142
+ )
mteb/types/__init__.py CHANGED
@@ -4,6 +4,7 @@ from ._encoder_io import (
4
4
  Conversation,
5
5
  ConversationTurn,
6
6
  CorpusDatasetType,
7
+ EncodeKwargs,
7
8
  InstructionDatasetType,
8
9
  PromptType,
9
10
  QueryDatasetType,
@@ -30,6 +31,7 @@ __all__ = [
30
31
  "Conversation",
31
32
  "ConversationTurn",
32
33
  "CorpusDatasetType",
34
+ "EncodeKwargs",
33
35
  "HFSubset",
34
36
  "ISOLanguage",
35
37
  "ISOLanguageScript",
mteb/types/_encoder_io.py CHANGED
@@ -13,6 +13,18 @@ if TYPE_CHECKING:
13
13
  from PIL import Image
14
14
 
15
15
 
16
+ class EncodeKwargs(TypedDict):
17
+ """Keyword arguments for encoding methods.
18
+
19
+ Attributes:
20
+ batch_size: The batch size to use for encoding.
21
+ show_progress_bar: Whether to show a progress bar during encoding.
22
+ """
23
+
24
+ batch_size: NotRequired[int]
25
+ show_progress_bar: NotRequired[bool]
26
+
27
+
16
28
  # --- Output types ---
17
29
  Array = np.ndarray | torch.Tensor
18
30
  """General array type, can be a numpy array or a torch tensor."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mteb
3
- Version: 2.6.6
3
+ Version: 2.6.8
4
4
  Summary: Massive Text Embedding Benchmark
5
5
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
6
6
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>