mteb 2.1.3__py3-none-any.whl → 2.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,29 +1,29 @@
1
1
  {
2
2
  "test": {
3
- "num_samples": 4872,
4
- "number_of_characters": 9352943,
3
+ "num_samples": 6362,
4
+ "number_of_characters": 180770,
5
5
  "documents_text_statistics": {
6
- "total_text_length": 8957572,
7
- "min_text_length": 8,
8
- "average_text_length": 3504.527386541471,
9
- "max_text_length": 47929,
10
- "unique_texts": 2556
6
+ "total_text_length": 39142,
7
+ "min_text_length": 2,
8
+ "average_text_length": 7.68243375858685,
9
+ "max_text_length": 31,
10
+ "unique_texts": 5095
11
11
  },
12
12
  "documents_image_statistics": null,
13
13
  "queries_text_statistics": {
14
- "total_text_length": 395371,
15
- "min_text_length": 8,
16
- "average_text_length": 170.71286701208982,
17
- "max_text_length": 2863,
18
- "unique_texts": 2316
14
+ "total_text_length": 141628,
15
+ "min_text_length": 79,
16
+ "average_text_length": 111.78216258879242,
17
+ "max_text_length": 185,
18
+ "unique_texts": 1267
19
19
  },
20
20
  "queries_image_statistics": null,
21
21
  "relevant_docs_statistics": {
22
- "num_relevant_docs": 2316,
22
+ "num_relevant_docs": 1267,
23
23
  "min_relevant_docs_per_query": 1,
24
24
  "average_relevant_docs_per_query": 1.0,
25
25
  "max_relevant_docs_per_query": 1,
26
- "unique_relevant_docs": 988
26
+ "unique_relevant_docs": 478
27
27
  },
28
28
  "top_ranked_statistics": null
29
29
  }
@@ -0,0 +1,58 @@
1
+ from mteb.models.model_implementations.google_models import gemma_embedding_loader
2
+ from mteb.models.model_meta import ModelMeta
3
+
4
+ Tarka_Embedding_150M_V1_CITATION = """@misc{tarka_ai_research_2025,
5
+ author = { Tarka AI Research },
6
+ title = { Tarka-Embedding-150M-V1 (Revision c5f4f43) },
7
+ year = 2025,
8
+ url = { https://huggingface.co/Tarka-AIR/Tarka-Embedding-150M-V1 },
9
+ doi = { 10.57967/hf/6875 },
10
+ publisher = { Hugging Face }
11
+ }"""
12
+
13
+ MULTILINGUAL_EVALUATED_LANGUAGES = [
14
+ "arb-Arab",
15
+ "eng-Latn",
16
+ "spa-Latn",
17
+ "deu-Latn",
18
+ "fra-Latn",
19
+ "jpn-Jpan",
20
+ "kor-Hang",
21
+ "zho-Hans",
22
+ ]
23
+
24
+ training_data = {
25
+ "T2Retrieval",
26
+ "DuRetrieval",
27
+ "MMarcoReranking",
28
+ "CMedQAv2-reranking",
29
+ "NQ",
30
+ "MSMARCO",
31
+ "HotpotQA",
32
+ "FEVER",
33
+ "MrTidyRetrieval",
34
+ "MIRACLRetrieval",
35
+ "CodeSearchNet",
36
+ }
37
+
38
+ tarka_embedding_150m_v1 = ModelMeta(
39
+ loader=gemma_embedding_loader,
40
+ name="Tarka-AIR/Tarka-Embedding-150M-V1",
41
+ languages=MULTILINGUAL_EVALUATED_LANGUAGES,
42
+ open_weights=True,
43
+ revision="c5f4f43",
44
+ release_date="2025-11-04",
45
+ n_parameters=155_714_304,
46
+ embed_dim=768,
47
+ max_tokens=2048,
48
+ license="gemma",
49
+ reference="https://huggingface.co/Tarka-AIR/Tarka-Embedding-150M-V1",
50
+ framework=["Sentence Transformers", "PyTorch"],
51
+ use_instructions=True,
52
+ public_training_code=None,
53
+ public_training_data=None,
54
+ training_datasets=training_data,
55
+ similarity_fn_name="cosine",
56
+ memory_usage_mb=576,
57
+ citation=Tarka_Embedding_150M_V1_CITATION,
58
+ )
@@ -156,16 +156,15 @@ class VoyageModel(AbsEncoder):
156
156
  and len(batch) < batch_size
157
157
  and batch_tokens < self._max_tokens_per_batch
158
158
  ):
159
- n_tokens = len(
160
- self._client.tokenize([sentences[index]], model=self._model_name)[0]
161
- )
159
+ txt = sentences[index] if len(sentences[index]) > 0 else " "
160
+ n_tokens = len(self._client.tokenize([txt], model=self._model_name)[0])
162
161
  if (
163
162
  batch_tokens + n_tokens > self._max_tokens_per_batch
164
163
  and len(batch) > 0
165
164
  ):
166
165
  break
167
166
  batch_tokens += n_tokens
168
- batch.append(sentences[index])
167
+ batch.append(txt)
169
168
  index += 1
170
169
 
171
170
  embeddings.extend(
@@ -249,7 +248,7 @@ voyage_3_5 = ModelMeta(
249
248
  n_parameters=None,
250
249
  memory_usage_mb=None,
251
250
  license=None,
252
- reference="https://docs.voyageai.com/docs/embeddings",
251
+ reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
253
252
  similarity_fn_name="cosine",
254
253
  framework=["API"],
255
254
  use_instructions=True,
@@ -274,7 +273,7 @@ voyage_3_5_int8 = ModelMeta(
274
273
  n_parameters=None,
275
274
  memory_usage_mb=None,
276
275
  license=None,
277
- reference="https://docs.voyageai.com/docs/flexible-dimensions-and-quantization",
276
+ reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
278
277
  similarity_fn_name="cosine",
279
278
  framework=["API"],
280
279
  use_instructions=True,
@@ -300,7 +299,7 @@ voyage_3_5_binary = ModelMeta(
300
299
  n_parameters=None,
301
300
  memory_usage_mb=None,
302
301
  license=None,
303
- reference="https://docs.voyageai.com/docs/flexible-dimensions-and-quantization",
302
+ reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
304
303
  similarity_fn_name="cosine",
305
304
  framework=["API"],
306
305
  use_instructions=True,
@@ -191,6 +191,7 @@ class SearchEncoderWrapper:
191
191
  cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
192
192
  cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
193
193
 
194
+ sub_corpus_ids = list(sub_corpus_ids)
194
195
  for query_itr in range(len(query_embeddings)):
195
196
  query_id = query_idx_to_id[query_itr]
196
197
  for sub_corpus_id, score in zip(
@@ -9,7 +9,7 @@ class WinoGrande(AbsTaskRetrieval):
9
9
  reference="https://winogrande.allenai.org/",
10
10
  dataset={
11
11
  "path": "mteb/WinoGrande",
12
- "revision": "770abbd7f77affc005f9734996e795925cbc0f65",
12
+ "revision": "4dec9c5666e9f84702ac614363db6d96a68bc6de",
13
13
  },
14
14
  type="Retrieval",
15
15
  category="t2t",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mteb
3
- Version: 2.1.3
3
+ Version: 2.1.5
4
4
  Summary: Massive Text Embedding Benchmark
5
5
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
6
6
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -1340,7 +1340,7 @@ mteb/descriptive_stats/Retrieval/VieQuADRetrieval.json,sha256=NHt0U-wJXBOPYOki5Y
1340
1340
  mteb/descriptive_stats/Retrieval/WebFAQRetrieval.json,sha256=uNkLCfiuVbdNKZ54vYGo8dPaoBlTShexDjl_IwCcT_M,60318
1341
1341
  mteb/descriptive_stats/Retrieval/WikiSQLRetrieval.json,sha256=JinCBRnmfMDeIwQtQfD6bD8MYNEpUgedw05A6G-W7u4,985
1342
1342
  mteb/descriptive_stats/Retrieval/WikipediaRetrievalMultilingual.json,sha256=v9npJOOJrcBUdRQ8EzAbOrpynAoSAJBaJoSJhS-qkww,20357
1343
- mteb/descriptive_stats/Retrieval/WinoGrande.json,sha256=--CmwZnUf7EBw01t8yz2UGc8ifsMKtSBT-QIiMRkKm0,989
1343
+ mteb/descriptive_stats/Retrieval/WinoGrande.json,sha256=czzrnqg24MPfCrlVPLamL4LvMdWWOZC4R4tJYjc1QAA,982
1344
1344
  mteb/descriptive_stats/Retrieval/XMarket.json,sha256=s0odo5MWwzgQi0HwqK4APYhWIBR8WEtiC8KKt7wgzOc,4770
1345
1345
  mteb/descriptive_stats/Retrieval/XPQARetrieval.json,sha256=qHxeCggIwW1iw5ujbHsCc_7rf_-JIhfYRFGEKvzSTO0,44958
1346
1346
  mteb/descriptive_stats/Retrieval/XQuADRetrieval.json,sha256=NjNvrloDl561L_WTWBg6fQ31yBTZInYWL-SW0q35SfA,15462
@@ -1419,7 +1419,7 @@ mteb/models/get_model_meta.py,sha256=VpZZNINk-QrNeVpPZnlqzlLhtBs8G84eRwTzAb_gRD4
1419
1419
  mteb/models/instruct_wrapper.py,sha256=HxHmnlxkjtZhfgTZRYJBT3Nma7Dhx6a9e2Bg-cO_IYs,8844
1420
1420
  mteb/models/model_meta.py,sha256=b-Nel9nX5bJk4cgJnqkBzEKyMY7uXvxlCBSxmmH1Ios,14769
1421
1421
  mteb/models/models_protocols.py,sha256=D2hYWn_UBGMaKtRwBx3u0B0ni6lHJjSzTxX21XFNwIc,8917
1422
- mteb/models/search_wrappers.py,sha256=0McxwGnqyiYKPHjHsxWZp1pP9qGuHemZjeX1z5ZgNAI,15588
1422
+ mteb/models/search_wrappers.py,sha256=W99EeMDQ58N0auVp2-7T39orG7GZLekSsVCdxEZLxw0,15638
1423
1423
  mteb/models/sentence_transformer_wrapper.py,sha256=n5CMsM6Lpg_CFHH0NkpJusMsaLUTt-L9vRmFINQ961k,12338
1424
1424
  mteb/models/cache_wrappers/__init__.py,sha256=j3JBHN73Tr7uMUO92FEvKXstnybxrPpGWmKXU2lAoIE,88
1425
1425
  mteb/models/cache_wrappers/cache_backend_protocol.py,sha256=TR7kD7KbN1J4piszIecpegtLZYGy7sRHZt3SDWlImKk,1665
@@ -1521,6 +1521,7 @@ mteb/models/model_implementations/shuu_model.py,sha256=KkcuVYjIzoha3Fvxh8ppqHQ9B
1521
1521
  mteb/models/model_implementations/siglip_models.py,sha256=tvi8QB2ayBoeXsxwHrl5RFlkknvE6FM9N06zSBWGQD0,12602
1522
1522
  mteb/models/model_implementations/sonar_models.py,sha256=Nc6kAJRWSrxA57DPRrgOPHqS1dNhz2vsE_1ZA2JtigQ,4784
1523
1523
  mteb/models/model_implementations/stella_models.py,sha256=NL3tk-rnuBdznsQ-nmelqun4tFO2xKoNPPOOVKqnPGU,8062
1524
+ mteb/models/model_implementations/tarka_models.py,sha256=aj4PvEzZ6ZSKcvwYVuTxf1IFOvH4rmJHtbPUcRw1fMI,1568
1524
1525
  mteb/models/model_implementations/text2vec_models.py,sha256=zaHWRc2W0RYZAOetinqRzug9UGW0HmY5U-jYsLXA8wo,4160
1525
1526
  mteb/models/model_implementations/ua_sentence_models.py,sha256=fcvXR4-Rrt-UDTlDkh2ZAO1gO_ufCOHiT6EhoeKiHx8,1224
1526
1527
  mteb/models/model_implementations/uae_models.py,sha256=KZxH5a3t-sfh33xUBkLizEuyFAyPlGfnRsn-S7mjq74,3112
@@ -1528,7 +1529,7 @@ mteb/models/model_implementations/vdr_models.py,sha256=lMm43BBPjZU5lxZcpmPZ8hn0P
1528
1529
  mteb/models/model_implementations/vi_vn_models.py,sha256=quWmd3JT2J6SlAsFrV2gcnc67M9zr58mEF2zLUF8-uw,4795
1529
1530
  mteb/models/model_implementations/vista_models.py,sha256=Q3I01kRtIPaoke0iMIcH4CLcCDTnMSIBFNCof7LPTX4,10832
1530
1531
  mteb/models/model_implementations/vlm2vec_models.py,sha256=HGGy_-z9Wc99xOKum71rBNipCPqWcM1efmmXgy5Rvxc,11724
1531
- mteb/models/model_implementations/voyage_models.py,sha256=Qn9foyBmAGuuhispwZTFQb5ZtDM-OWMif5Ca4PBEhcw,19842
1532
+ mteb/models/model_implementations/voyage_models.py,sha256=dOCccOQlloGrg0q44PxMQzx8dHuQ8VgkDUD01EydpJ0,19824
1532
1533
  mteb/models/model_implementations/voyage_v.py,sha256=6i-oFnaY2D2qR1Dgb0B98ougnD1ujW9aNG9QoWyvwwY,8041
1533
1534
  mteb/models/model_implementations/xyz_models.py,sha256=TePlrH6EHwRPO87U_J3Yce9-XHCn_X7I2cJ_6BZ2fUY,1296
1534
1535
  mteb/models/model_implementations/youtu_models.py,sha256=NB74E6z-_36HyXb8GXKn8CrmRLN68uX9eH4xcS57zl0,5938
@@ -2283,7 +2284,7 @@ mteb/tasks/retrieval/eng/viz_wiz_it2t_retrieval.py,sha256=jE70T5If62lkKnbF-CMAgR
2283
2284
  mteb/tasks/retrieval/eng/vqa2_it2t_retrieval.py,sha256=M_g6Y6OrNRByD52-JxuO8iIO8aFUg8HHg5BxQ31-m1I,1403
2284
2285
  mteb/tasks/retrieval/eng/web_qa_t2it_retrieval.py,sha256=c7pJja_ii4ku9pfd-Gd3FqO6cF-0IIEb_H0FRY2A69w,1477
2285
2286
  mteb/tasks/retrieval/eng/web_qa_t2t_retrieval.py,sha256=rx6uoqc8yduGhuvdv2K5v2oFiQI8jP-BEt5nmaKrsac,1517
2286
- mteb/tasks/retrieval/eng/wino_grande_retrieval.py,sha256=bRjNxz_pgH7waI7m_NMR_VLvcRxnBtMXY6CTU4RLEnc,1714
2287
+ mteb/tasks/retrieval/eng/wino_grande_retrieval.py,sha256=ou8TlZ-JPS1nh7NS7OeerUsB2WRZWWwKTuygpJNLb2A,1714
2287
2288
  mteb/tasks/retrieval/est/__init__.py,sha256=uNkOSKfZsO1F-xC4twL8ukxtfrI4A4eIU-oAs3Hi5Dg,46
2288
2289
  mteb/tasks/retrieval/est/estqa.py,sha256=sORL3KI47yXOy8GXptBtCuryOdDShdRDFpCdnnIaaCI,1418
2289
2290
  mteb/tasks/retrieval/fas/__init__.py,sha256=DUq1CTC_nj-201dbUNqlmqN-oR-YKjeW3O8DhtMX9rk,2213
@@ -2536,9 +2537,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
2536
2537
  mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
2537
2538
  mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
2538
2539
  mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
2539
- mteb-2.1.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2540
- mteb-2.1.3.dist-info/METADATA,sha256=fSSp_uFAo3SSrjhhMQGHJMDIyEjeinv-7QjmK11VrPQ,13573
2541
- mteb-2.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2542
- mteb-2.1.3.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2543
- mteb-2.1.3.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2544
- mteb-2.1.3.dist-info/RECORD,,
2540
+ mteb-2.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2541
+ mteb-2.1.5.dist-info/METADATA,sha256=VunL5iKfFZLfKB-4Yruzd5vq31mhfNOT0JVjs2GibSQ,13573
2542
+ mteb-2.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2543
+ mteb-2.1.5.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2544
+ mteb-2.1.5.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2545
+ mteb-2.1.5.dist-info/RECORD,,
File without changes