mteb 2.1.17__py3-none-any.whl → 2.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ import numpy as np
2
+
3
+ from mteb.models.model_implementations.model2vec_models import Model2VecModel
4
+ from mteb.models.model_meta import ModelMeta, ScoringFunction
5
+
6
+ model2vecdk = ModelMeta(
7
+ loader=Model2VecModel, # type: ignore
8
+ name="andersborges/model2vecdk",
9
+ languages=["dan-Latn"],
10
+ open_weights=True,
11
+ revision="cb576c78dcc1b729e4612645f61db59929d69e61",
12
+ release_date="2025-11-21",
13
+ n_parameters=48042496,
14
+ memory_usage_mb=183,
15
+ max_tokens=np.inf,
16
+ embed_dim=256,
17
+ license="mit",
18
+ similarity_fn_name=ScoringFunction.COSINE,
19
+ framework=["NumPy", "Sentence Transformers"],
20
+ reference="https://huggingface.co/andersborges/model2vecdk",
21
+ use_instructions=False,
22
+ adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
23
+ superseded_by=None,
24
+ training_datasets=set(), # distilled
25
+ public_training_code="https://github.com/andersborges/dkmodel2vec",
26
+ public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
27
+ )
28
+
29
+
30
+ model2vecdk_stem = ModelMeta(
31
+ loader=Model2VecModel, # type: ignore
32
+ name="andersborges/model2vecdk-stem",
33
+ languages=["dan-Latn"],
34
+ open_weights=True,
35
+ revision="cb576c78dcc1b729e4612645f61db59929d69e61",
36
+ release_date="2025-11-21",
37
+ n_parameters=48578560,
38
+ memory_usage_mb=185,
39
+ max_tokens=np.inf,
40
+ embed_dim=256,
41
+ license="mit",
42
+ similarity_fn_name=ScoringFunction.COSINE,
43
+ framework=["NumPy", "Sentence Transformers"],
44
+ reference="https://huggingface.co/andersborges/model2vecdk",
45
+ use_instructions=False,
46
+ adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
47
+ superseded_by=None,
48
+ training_datasets=set(), # distilled
49
+ public_training_code="https://github.com/andersborges/dkmodel2vec",
50
+ public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
51
+ )
@@ -221,7 +221,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
221
221
  ) -> None:
222
222
  import cohere # type: ignore
223
223
 
224
- self.model_name = model_name.lstrip("Cohere/Cohere-")
224
+ self.model_name = model_name.removeprefix("Cohere/Cohere-")
225
225
  self.sep = sep
226
226
  self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
227
227
  if embedding_type not in get_args(EmbeddingType):
@@ -342,7 +342,6 @@ tarka_embedding_150m_v1 = ModelMeta(
342
342
 
343
343
  tark_embedding_350_v1_kwargs = dict(
344
344
  model_kwargs={
345
- "attn_implementation": "flash_attention_2",
346
345
  "torch_dtype": "bfloat16",
347
346
  }, # use low-precision
348
347
  trust_remote_code=True,
@@ -357,7 +356,7 @@ tarka_embedding_350m_v1 = ModelMeta(
357
356
  name="Tarka-AIR/Tarka-Embedding-350M-V1",
358
357
  languages=MULTILINGUAL_EVALUATED_LANGUAGES,
359
358
  open_weights=True,
360
- revision="f4b5de82060cf3a833e52580e7ce59adeacb6fb5",
359
+ revision="a850d6a329145474727424fed6b12b62096b8ba3",
361
360
  release_date="2025-11-11",
362
361
  n_parameters=354_483_968,
363
362
  memory_usage_mb=676,
@@ -23,7 +23,7 @@ class BUCCBitextMining(AbsTaskBitextMining):
23
23
  "path": "mteb/BUCC",
24
24
  "revision": "414572247440f0ccacf7eb0bb70a31533a0e5443",
25
25
  },
26
- description="BUCC bitext mining dataset",
26
+ description="BUCC bitext mining dataset train split.",
27
27
  reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
28
28
  type="BitextMining",
29
29
  category="t2t",
@@ -71,7 +71,9 @@ Rapp, Reinhard},
71
71
 
72
72
  sentence1 = data["sentence1"][0]
73
73
  sentence2 = data["sentence2"][0]
74
- sentence1 = [sentence1[i] for (i, j) in gold]
74
+ sentence1 = [
75
+ sentence1[i] for (i, j) in gold
76
+ ] # keep only sentences in gold. The 2nd value is meant for sentence2 but not used here. This is fixed in BUCC.v2.
75
77
  logger.info(f"Lang {lang} num gold {len(gold)}")
76
78
  logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
77
79
  logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")
@@ -20,7 +20,7 @@ class BUCCBitextMiningFast(AbsTaskBitextMining):
20
20
  "path": "mteb/bucc-bitext-mining",
21
21
  "revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
22
22
  },
23
- description="BUCC bitext mining dataset",
23
+ description="BUCC bitext mining dataset train split, gold set only.",
24
24
  reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
25
25
  type="BitextMining",
26
26
  category="t2t",
@@ -26,5 +26,20 @@ class GreenNodeTableMarkdownRetrieval(AbsTaskRetrieval):
26
26
  annotations_creators="human-annotated",
27
27
  dialect=[],
28
28
  sample_creation="found",
29
- bibtex_citation="", # TODO: Add bibtex citation when the paper is published
29
+ bibtex_citation=r"""
30
+ @inproceedings{10.1007/978-981-95-1746-6_17,
31
+ abstract = {Information retrieval often comes in plain text, lacking semi-structured text such as HTML and markdown, retrieving data that contains rich format such as table became non-trivial. In this paper, we tackle this challenge by introducing a new dataset, GreenNode Table Retrieval VN (GN-TRVN), which is collected from a massive corpus, a wide range of topics, and a longer context compared to ViQuAD2.0. To evaluate the effectiveness of our proposed dataset, we introduce two versions, M3-GN-VN and M3-GN-VN-Mixed, by fine-tuning the M3-Embedding model on this dataset. Experimental results show that our models consistently outperform the baselines, including the base model, across most evaluation criteria on various datasets such as VieQuADRetrieval, ZacLegalTextRetrieval, and GN-TRVN. In general, we release a more comprehensive dataset and two model versions that improve response performance for Vietnamese Markdown Table Retrieval.},
32
+ address = {Singapore},
33
+ author = {Pham, Bao Loc
34
+ and Hoang, Quoc Viet
35
+ and Luu, Quy Tung
36
+ and Vo, Trong Thu},
37
+ booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
38
+ isbn = {978-981-95-1746-6},
39
+ pages = {153--163},
40
+ publisher = {Springer Nature Singapore},
41
+ title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
42
+ year = {2026},
43
+ }
44
+ """,
30
45
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mteb
3
- Version: 2.1.17
3
+ Version: 2.1.18
4
4
  Summary: Massive Text Embedding Benchmark
5
5
  Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
6
6
  Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
@@ -1445,6 +1445,7 @@ mteb/models/cache_wrappers/cache_backends/numpy_cache.py,sha256=GyTVC5DLph3EeRnD
1445
1445
  mteb/models/model_implementations/__init__.py,sha256=BZDdde6ajKv-yroy9mqE2YS3Hw1KBdKoxBPg8aPTZEs,1164
1446
1446
  mteb/models/model_implementations/align_models.py,sha256=DUdVWxETiwC2IrXI90zQwlvHMjeI7JJCNOmFVd2RNws,4518
1447
1447
  mteb/models/model_implementations/amazon_models.py,sha256=pdRU2QGAB5ccQnAfbRSzHE1G3ZUdjvsAgeJwkB_olDQ,694
1448
+ mteb/models/model_implementations/andersborges.py,sha256=QUFpASdcCy-IMz2O2C3OAOhMWA2ksNHM4GFWlkELIT4,1879
1448
1449
  mteb/models/model_implementations/ara_models.py,sha256=zS0t9rI21wwEwTlrlX94GqkmPKLnb8ktUaAOY-ZLmw0,1421
1449
1450
  mteb/models/model_implementations/arctic_models.py,sha256=eaMRaN9WdpVq1W6cbtNcJMdrJUTXrTSYUjTJufCdZRY,10350
1450
1451
  mteb/models/model_implementations/b1ade_models.py,sha256=aEKmXWVX8iJ_OotAYPOMxsOHTDEOJYdSwkR6iJsZ-ms,1609
@@ -1459,7 +1460,7 @@ mteb/models/model_implementations/cde_models.py,sha256=3nNU3nq3VZZcImFqH1VPj57-Q
1459
1460
  mteb/models/model_implementations/clip_models.py,sha256=zrfgNmZszu0JMtMNdCMzEohixsrnQ7xFhCqgsiucH_Q,6107
1460
1461
  mteb/models/model_implementations/codefuse_models.py,sha256=19Y-d_qetVU64quzEvuUJ_K8DHo1JEEKEGqjRR48dFg,9113
1461
1462
  mteb/models/model_implementations/codesage_models.py,sha256=D4CdISGyv5f2GMYq4_efgm5qNq80SWAX5R2u5mjEiXM,2998
1462
- mteb/models/model_implementations/cohere_models.py,sha256=H7Mjn57kmeBhIaJx6riaCEEgZS01YBWBBGPagEU87ZQ,13690
1463
+ mteb/models/model_implementations/cohere_models.py,sha256=LiYYRT3clhFlh0RE654KyZtO66vnIO22h79HJLmXYwk,13696
1463
1464
  mteb/models/model_implementations/cohere_v.py,sha256=K6VEw1NkyM2PuMd18kHE6aqPrcByYSwEmAKjvLods_w,15760
1464
1465
  mteb/models/model_implementations/colpali_models.py,sha256=uVmK3jXO-GDn-7i6cJFWdc0u0-MU3INHHL1rXUPhBec,8944
1465
1466
  mteb/models/model_implementations/colqwen_models.py,sha256=Y6IBhYKbxjkC3AePa0l37-F50xcX1mtPCXpgW597HyA,8754
@@ -1538,7 +1539,7 @@ mteb/models/model_implementations/siglip_models.py,sha256=tvi8QB2ayBoeXsxwHrl5RF
1538
1539
  mteb/models/model_implementations/sonar_models.py,sha256=Nc6kAJRWSrxA57DPRrgOPHqS1dNhz2vsE_1ZA2JtigQ,4784
1539
1540
  mteb/models/model_implementations/spartan8806_atles_champion.py,sha256=9sWQH7tOT0uxXA7sbQcnqGt2f5O9xcw9HqFpRCzoQAA,918
1540
1541
  mteb/models/model_implementations/stella_models.py,sha256=NL3tk-rnuBdznsQ-nmelqun4tFO2xKoNPPOOVKqnPGU,8062
1541
- mteb/models/model_implementations/tarka_models.py,sha256=xC6olJs9PSe_lrYsScw5hDHTjYSjcxgbvfK_7IoBFnk,27397
1542
+ mteb/models/model_implementations/tarka_models.py,sha256=UwSb3e-k7dCgQAJv3176ZvKpkjLZfpdPzwf-b0Oxuuo,27345
1542
1543
  mteb/models/model_implementations/text2vec_models.py,sha256=zaHWRc2W0RYZAOetinqRzug9UGW0HmY5U-jYsLXA8wo,4160
1543
1544
  mteb/models/model_implementations/ua_sentence_models.py,sha256=fcvXR4-Rrt-UDTlDkh2ZAO1gO_ufCOHiT6EhoeKiHx8,1224
1544
1545
  mteb/models/model_implementations/uae_models.py,sha256=KZxH5a3t-sfh33xUBkLizEuyFAyPlGfnRsn-S7mjq74,3112
@@ -1581,8 +1582,8 @@ mteb/tasks/bitext_mining/kat/__init__.py,sha256=a-KcFJ3Ol7R8yq02RcGjaOxEfqnwJeo7
1581
1582
  mteb/tasks/bitext_mining/kat/tbilisi_city_hall_bitext_mining.py,sha256=xVCxpJr7UW2KadNdn7Gsw-wZ65uz5vhRDhQZ7eILokQ,1918
1582
1583
  mteb/tasks/bitext_mining/multilingual/__init__.py,sha256=qDgixbHEvV3xz6JN3kFQDnvtSL-yVjl-Z8inUwimI6I,1954
1583
1584
  mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py,sha256=lNbCz3dN9o3F04Y7vtNBhF-lPUNyVbAOKgUR-QKZn_8,29082
1584
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py,sha256=lg-4of5K_6mn7iU-TpxwP_HFRk10qILJWprR-QX9Jug,2708
1585
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py,sha256=Zv6QCyTchTmlgWegS-iNJKfcc1Tr-ZmGftub5xZO5-w,1808
1585
+ mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py,sha256=tTKvS-v7d9V_zymCn_ZonUKlo9NI7vTyppxS9iAu8I0,2873
1586
+ mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py,sha256=P_UHMWh2gKG6CloXmP5J2kjrCTQwoJAU1MKdLl6JFKc,1836
1586
1587
  mteb/tasks/bitext_mining/multilingual/danish_medicines_agency_bitext_mining.py,sha256=5iengckKv1NCHILjrX6WDEgBNJlmbSV5y-WWdaLZYrs,1703
1587
1588
  mteb/tasks/bitext_mining/multilingual/diabla_bitext_mining.py,sha256=Ua6DfpJYgoaIRSWB284WNj6wQWqxPiC3kUPcmKNGDWQ,1498
1588
1589
  mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py,sha256=Dh43H2NWZIAzN9Zeib3TtKhUcK1jldUQX-GvTsm1MnI,5616
@@ -2441,7 +2442,7 @@ mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py,sha256=ppFPam-3A
2441
2442
  mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py,sha256=hOiwz2bcayDW6VrCvsIGeYh1TT7koByM76rZZwtp9KA,1754
2442
2443
  mteb/tasks/retrieval/vie/fevervn_retrieval.py,sha256=xLGoXefGk1l1AFiOSf2Ja0fM_rAQp4tdaR8H6jJqYlI,1853
2443
2444
  mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py,sha256=FGfFuLzRCTuupRxZdjVbBiwCOSspb3vwvtNAKvyXjso,1714
2444
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py,sha256=AlvGHXqXwOTrEvZNcMxlWL_2a31iNrrZzNz2i6dcJec,1074
2445
+ mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py,sha256=O7iIcuvqhrHjB7J1VxH9YJ3v6cuFFBQdrrnYwLgeRfE,2429
2445
2446
  mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py,sha256=FYWj8EhnfwDuPRxZ8uTeGkfa2Q-jDU2bliTmp975Coc,1837
2446
2447
  mteb/tasks/retrieval/vie/msmarcovn_retrieval.py,sha256=xtJ1-rjx4slwSR8p6NedqItTk-79ZzT2f9FlDOhbzkE,1958
2447
2448
  mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py,sha256=4S8IDJ-TVjKEy2teM8GOeDzHIZR8txkPvX0sGDYIyqs,1780
@@ -2555,9 +2556,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
2555
2556
  mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
2556
2557
  mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
2557
2558
  mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
2558
- mteb-2.1.17.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2559
- mteb-2.1.17.dist-info/METADATA,sha256=HZ7_vl0KWjqAR65easMGKslMQs6wsc3lIpZZfR_yi5w,13574
2560
- mteb-2.1.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2561
- mteb-2.1.17.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2562
- mteb-2.1.17.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2563
- mteb-2.1.17.dist-info/RECORD,,
2559
+ mteb-2.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2560
+ mteb-2.1.18.dist-info/METADATA,sha256=OF7o0Df2GbEQuuNXdXxByJR0atmGv4XdRrwMSWuOcx0,13574
2561
+ mteb-2.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2562
+ mteb-2.1.18.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
2563
+ mteb-2.1.18.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
2564
+ mteb-2.1.18.dist-info/RECORD,,
File without changes