mteb-2.7.15-py3-none-any.whl → mteb-2.7.16-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective registries. It is provided for informational purposes only.
--- a/mteb/models/model_implementations/misc_models.py
+++ b/mteb/models/model_implementations/misc_models.py
@@ -1007,54 +1007,6 @@ thenlper__gte_small = ModelMeta(
   year={2023}
 }""",
 )
-OrlikB__KartonBERT_USE_base_v1 = ModelMeta(
-    name="OrlikB/KartonBERT-USE-base-v1",
-    model_type=["dense"],
-    revision="1f59dd58fe57995c0e867d5e29f03763eae99645",
-    release_date="2024-09-30",
-    languages=["pol-Latn"],
-    loader=sentence_transformers_loader,
-    n_parameters=103705344,
-    n_embedding_parameters=None,
-    memory_usage_mb=396,
-    max_tokens=512.0,
-    embed_dim=768,
-    license="gpl-3.0",
-    open_weights=True,
-    public_training_code=None,
-    public_training_data=None,
-    framework=["PyTorch"],
-    reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1",
-    similarity_fn_name=ScoringFunction.COSINE,
-    use_instructions=None,
-    training_datasets=None,
-    adapted_from="KartonBERT-USE-base-v1",
-    superseded_by=None,
-)
-OrlikB__st_polish_kartonberta_base_alpha_v1 = ModelMeta(
-    name="OrlikB/st-polish-kartonberta-base-alpha-v1",
-    model_type=["dense"],
-    revision="5590a0e2d7bb43674e44d7076b3ff157f7d4a1cb",
-    release_date="2023-11-12",
-    languages=["pol-Latn"],
-    loader=sentence_transformers_loader,
-    n_parameters=None,
-    n_embedding_parameters=None,
-    memory_usage_mb=None,
-    max_tokens=514.0,
-    embed_dim=768,
-    license="lgpl",
-    open_weights=True,
-    public_training_code=None,
-    public_training_data=None,
-    framework=["PyTorch"],
-    reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1",
-    similarity_fn_name=ScoringFunction.COSINE,
-    use_instructions=None,
-    training_datasets=None,
-    adapted_from="st-polish-kartonberta-base-alpha-v1",
-    superseded_by=None,
-)
 sdadas__mmlw_e5_base = ModelMeta(
     name="sdadas/mmlw-e5-base",
     model_type=["dense"],
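
Note: the deletions above remove only MTEB's registry entries for these two Polish models. If the checkpoints themselves are still published on the Hugging Face Hub (an assumption, not something this diff confirms), they can be loaded directly with sentence-transformers, independent of MTEB; a minimal sketch, pinning the revision recorded in the removed entry:

    # Loading the delisted checkpoint directly; assumes it is still on the Hub.
    # The revision pin and embed_dim come from the removed ModelMeta above.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(
        "OrlikB/KartonBERT-USE-base-v1",
        revision="1f59dd58fe57995c0e867d5e29f03763eae99645",
    )
    embeddings = model.encode(["Przykładowe polskie zdanie."])  # a Polish sample sentence
    print(embeddings.shape)  # (1, 768), matching embed_dim=768 above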
--- a/mteb/models/model_implementations/rerankers_custom.py
+++ b/mteb/models/model_implementations/rerankers_custom.py
@@ -103,68 +103,6 @@ class BGEReranker(RerankerWrapper):
         return scores


-class MonoBERTReranker(RerankerWrapper):
-    name: str = "MonoBERT"
-
-    def __init__(
-        self,
-        model_name_or_path="castorini/monobert-large-msmarco",
-        torch_compile=False,
-        **kwargs,
-    ):
-        from transformers import AutoModelForSequenceClassification, AutoTokenizer
-
-        super().__init__(model_name_or_path, **kwargs)
-        if not self.device:
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        model_args = {}
-        if self.fp_options:
-            model_args["torch_dtype"] = self.fp_options
-        self.model = AutoModelForSequenceClassification.from_pretrained(
-            model_name_or_path,
-            **model_args,
-        )
-        self.model.to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-        self.max_length = self.tokenizer.model_max_length
-        logger.info(f"Using max_length of {self.max_length}")
-
-        self.model.eval()
-
-    @torch.inference_mode()
-    def predict(
-        self,
-        inputs1: DataLoader[BatchedInput],
-        inputs2: DataLoader[BatchedInput],
-        *,
-        task_metadata: TaskMetadata,
-        hf_split: str,
-        hf_subset: str,
-        prompt_type: PromptType | None = None,
-        **kwargs: Any,
-    ) -> Array:
-        queries = [text for batch in inputs1 for text in batch["query"]]
-        instructions = None
-        if "instruction" in inputs2.dataset.features:
-            instructions = [text for batch in inputs1 for text in batch["instruction"]]
-        passages = [text for batch in inputs2 for text in batch["text"]]
-
-        if instructions is not None and instructions[0] is not None:
-            queries = [f"{q} {i}".strip() for i, q in zip(instructions, queries)]
-
-        tokens = self.tokenizer(
-            queries,
-            passages,
-            padding=True,
-            truncation="only_second",
-            return_tensors="pt",
-            max_length=self.max_length,
-        ).to(self.device)
-        output = self.model(**tokens)[0]
-        batch_scores = torch.nn.functional.log_softmax(output, dim=1)
-        return batch_scores[:, 1].exp()
-
-
 class JinaReranker(RerankerWrapper):
     name = "Jina"

@@ -219,31 +157,6 @@ class JinaReranker(RerankerWrapper):
         return scores


-monobert_large = ModelMeta(
-    loader=MonoBERTReranker,
-    loader_kwargs=dict(
-        fp_options="float16",
-    ),
-    name="castorini/monobert-large-msmarco",
-    model_type=["cross-encoder"],
-    languages=["eng-Latn"],
-    open_weights=True,
-    revision="0a97706f3827389da43b83348d5d18c9d53876fa",
-    release_date="2020-05-28",
-    n_parameters=None,
-    n_embedding_parameters=31_254_528,
-    memory_usage_mb=None,
-    max_tokens=None,
-    embed_dim=None,
-    license=None,
-    public_training_code=None,
-    public_training_data=None,
-    similarity_fn_name=None,
-    use_instructions=None,
-    training_datasets=None,
-    framework=["Sentence Transformers", "PyTorch", "Transformers"],
-)
-
 # languages unclear: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual/discussions/28
 jina_reranker_multilingual = ModelMeta(
     loader=JinaReranker,
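
Both removals above concern the monoBERT cross-encoder. For reference, a minimal, self-contained sketch of the same scoring pattern as the deleted predict() method, using plain transformers (the MTEB wrapper plumbing, batching, and instruction handling are omitted; sample texts are illustrative):

    # monoBERT-style cross-encoder scoring: tokenize (query, passage) pairs
    # jointly, log_softmax over the two classes, keep p(relevant) = class 1.
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    name = "castorini/monobert-large-msmarco"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name).eval()

    queries = ["how do cross-encoders rank passages?"]
    passages = ["A cross-encoder reads query and passage together and emits a score."]

    with torch.inference_mode():
        tokens = tokenizer(
            queries,
            passages,
            padding=True,
            truncation="only_second",  # truncate the passage, never the query
            return_tensors="pt",
        )
        logits = model(**tokens).logits
        scores = torch.nn.functional.log_softmax(logits, dim=1)[:, 1].exp()
    print(scores)  # one relevance probability per (query, passage) pair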
--- a/mteb/models/model_implementations/rerankers_monot5_based.py
+++ b/mteb/models/model_implementations/rerankers_monot5_based.py
@@ -34,7 +34,6 @@ prediction_tokens = {
     "unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"],
     "unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"],
     "unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"],
-    "unicamp-dl/mt5-13b-mmarco-100k": ["▁", "▁true"],
 }

@@ -919,28 +918,3 @@ mt5_base_mmarco_v2 = ModelMeta(
     use_instructions=None,
     framework=["PyTorch", "Transformers"],
 )
-
-mt5_13b_mmarco_100k = ModelMeta(
-    loader=MonoT5Reranker,
-    loader_kwargs=dict(
-        fp_options="float16",
-    ),
-    name="unicamp-dl/mt5-13b-mmarco-100k",
-    model_type=["cross-encoder"],
-    languages=mt5_languages,
-    open_weights=True,
-    revision="e1a4317e102a525ea9e16745ad21394a4f1bffbc",
-    release_date="2022-11-04",
-    n_parameters=None,
-    n_embedding_parameters=1_024_458_752,
-    memory_usage_mb=None,
-    max_tokens=None,
-    embed_dim=None,
-    license=None,
-    public_training_code=None,
-    public_training_data=None,
-    similarity_fn_name=None,
-    use_instructions=None,
-    training_datasets=None,
-    framework=["PyTorch", "Transformers"],
-)
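
The prediction_tokens table edited above maps each monoT5-style checkpoint to its (negative, positive) sentinel tokens; relevance is read off the logits of those two tokens at the first decoding step. A minimal sketch of that scheme, using a checkpoint that remains in the table (the "Query: … Document: … Relevant:" prompt is the conventional monoT5 input format; sample texts are illustrative):

    # MonoT5-style relevance scoring with the "▁no"/"▁yes" sentinel tokens.
    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    name = "unicamp-dl/mt5-base-mmarco-v2"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name).eval()

    # Sentinel token ids from the prediction_tokens table.
    neg_id, pos_id = tokenizer.convert_tokens_to_ids(["▁no", "▁yes"])

    query = "what does mteb benchmark?"
    passage = "MTEB benchmarks text embedding models across many tasks."
    inputs = tokenizer(
        f"Query: {query} Document: {passage} Relevant:", return_tensors="pt"
    )

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=1,               # only the first decoded token matters
            output_scores=True,
            return_dict_in_generate=True,
        )
    # Softmax over just the two sentinel logits; keep the positive one.
    logits = out.scores[0][0, [neg_id, pos_id]]
    score = torch.nn.functional.log_softmax(logits, dim=0)[1].exp()
    print(float(score))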
--- a/mteb-2.7.15.dist-info/METADATA
+++ b/mteb-2.7.16.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mteb
-Version: 2.7.15
+Version: 2.7.16
 Summary: Massive Text Embedding Benchmark
 Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
 Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
--- a/mteb-2.7.15.dist-info/RECORD
+++ b/mteb-2.7.16.dist-info/RECORD
@@ -1556,7 +1556,7 @@ mteb/models/model_implementations/llm2clip_models.py,sha256=X3W16uipaZ0t4Mco4lhh
 mteb/models/model_implementations/llm2vec_models.py,sha256=n86YQ8fAHU1gVtlY7tZcXq-1ab_ISxBmuk-X4MDnY4o,13348
 mteb/models/model_implementations/mcinext_models.py,sha256=T3vO9JQSmh3BICp6Y_q7j4anuA8P8LGZ4ZWnwGnF7cs,19299
 mteb/models/model_implementations/mdbr_models.py,sha256=AqsRZ-IDekIjq-FDWu0zx7Nk9ySJxaWTdRb8YhUZeu4,2828
-mteb/models/model_implementations/misc_models.py,sha256=0FkvheqPYh3JwM65F4CDlQKBDQQdjyMyfJPUdP1X2Ns,74780
+mteb/models/model_implementations/misc_models.py,sha256=JkJsyha-B5M8myLvHIwFUV14yo2lnSuBzHeO5fE9i74,73191
 mteb/models/model_implementations/mixedbread_ai_models.py,sha256=1-RD4M-16M-Rcf5CTD_R7LVoLv3cNFbmEjataQ__q94,10666
 mteb/models/model_implementations/mme5_models.py,sha256=V7BCGFkfZxkZ3ANJImvSFfP7in8OSfmkbqX-zXc_iF8,1574
 mteb/models/model_implementations/moco_models.py,sha256=6eEGpGTlI4StFRYsaNtXejhYE9GCqasUYCqB_SQy9cE,5714
@@ -1590,8 +1590,8 @@ mteb/models/model_implementations/random_baseline.py,sha256=YsITQoLbea_Iz2X84WNG
 mteb/models/model_implementations/rasgaard_models.py,sha256=_uNYP_nqJcOyoKnHNcvfJnP9gRvsv7HCWhZX2LJzQ9s,1322
 mteb/models/model_implementations/reasonir_model.py,sha256=WNWGqa9wANBL9vTdcFx51TEFXz6yHq_ygK0rij3LCL8,5217
 mteb/models/model_implementations/repllama_models.py,sha256=k6BgN2Cn41p0gQ0F1FdOTQ9OXlmFgG-2RtdvzOcCSZg,7543
-mteb/models/model_implementations/rerankers_custom.py,sha256=Bjgg_UbeHarupzzCk2rdy_Dd0_W0ZsE-DCD5v1EshnI,10953
-mteb/models/model_implementations/rerankers_monot5_based.py,sha256=6por4DPCycS8gljqKRZWUNM093bjjSVvmyQ3dzj9H6U,35321
+mteb/models/model_implementations/rerankers_custom.py,sha256=WBSA7kBRqxgb1549UwRYdtYzUovdwmW8C0PWzvGR54g,8087
+mteb/models/model_implementations/rerankers_monot5_based.py,sha256=U9ChokUEDXtkoFno-o4GeT4fXEEoFtnZn2denIafxi8,34583
 mteb/models/model_implementations/richinfoai_models.py,sha256=FsXamY-bvR5LLagtKK8fP-I5oc6B_bKp_i6_xzUYL8Y,1069
 mteb/models/model_implementations/ru_sentence_models.py,sha256=W4R985LnThJ-9XFbPnTGKb3L1QnoS3i3VXBFq94DK_w,43034
 mteb/models/model_implementations/ruri_models.py,sha256=3zYOqacB3JEnGJkMGYHqFgVkbmLo4uceJs9kzV54ivU,10819
@@ -2646,9 +2646,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
 mteb/types/_result.py,sha256=UKNokV9pu3G74MGebocU512aU_fFU9I9nPKnrG9Q0iE,1035
 mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
 mteb/types/statistics.py,sha256=gElgSShKBXpfcqaZHhU_d2UHln1CyzUj8FN8KFun_UA,4087
-mteb-2.7.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mteb-2.7.15.dist-info/METADATA,sha256=EoUeroRRdre5jYbplBGCJuWs-6M7cZGpzwLqSQyJKgI,14348
-mteb-2.7.15.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-mteb-2.7.15.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
-mteb-2.7.15.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
-mteb-2.7.15.dist-info/RECORD,,
+mteb-2.7.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mteb-2.7.16.dist-info/METADATA,sha256=a-Rt1xa9ZgNdKf-JlM6EUZE_pKzEHoT6KGpFZUvnPo0,14348
+mteb-2.7.16.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+mteb-2.7.16.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
+mteb-2.7.16.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
+mteb-2.7.16.dist-info/RECORD,,
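
The RECORD entries above can be checked against a downloaded wheel: per the wheel format (PEP 427, building on PEP 376), each hash is the urlsafe-base64 SHA-256 digest with trailing "=" padding stripped, followed by the file size in bytes. A small sketch, using the updated misc_models.py entry from this diff (the wheel filename is illustrative):

    # Verify one RECORD entry of the downloaded wheel.
    import base64
    import hashlib
    import zipfile

    wheel = "mteb-2.7.16-py3-none-any.whl"
    path = "mteb/models/model_implementations/misc_models.py"
    expected = "JkJsyha-B5M8myLvHIwFUV14yo2lnSuBzHeO5fE9i74"  # from RECORD above

    with zipfile.ZipFile(wheel) as zf:
        data = zf.read(path)

    # urlsafe base64 of the SHA-256 digest, without "=" padding.
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    assert digest == expected, f"hash mismatch: {digest}"
    print(f"{path}: OK ({len(data)} bytes)")  # RECORD lists the size as 73191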