mteb 2.1.17__py3-none-any.whl → 2.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/models/model_implementations/andersborges.py +51 -0
- mteb/models/model_implementations/cohere_models.py +1 -1
- mteb/models/model_implementations/tarka_models.py +1 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +4 -2
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py +1 -1
- mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py +16 -1
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/METADATA +1 -1
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/RECORD +12 -11
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/WHEEL +0 -0
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.17.dist-info → mteb-2.1.18.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
from mteb.models.model_implementations.model2vec_models import Model2VecModel
|
|
4
|
+
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
5
|
+
|
|
6
|
+
model2vecdk = ModelMeta(
|
|
7
|
+
loader=Model2VecModel, # type: ignore
|
|
8
|
+
name="andersborges/model2vecdk",
|
|
9
|
+
languages=["dan-Latn"],
|
|
10
|
+
open_weights=True,
|
|
11
|
+
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
12
|
+
release_date="2025-11-21",
|
|
13
|
+
n_parameters=48042496,
|
|
14
|
+
memory_usage_mb=183,
|
|
15
|
+
max_tokens=np.inf,
|
|
16
|
+
embed_dim=256,
|
|
17
|
+
license="mit",
|
|
18
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
19
|
+
framework=["NumPy", "Sentence Transformers"],
|
|
20
|
+
reference="https://huggingface.co/andersborges/model2vecdk",
|
|
21
|
+
use_instructions=False,
|
|
22
|
+
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
23
|
+
superseded_by=None,
|
|
24
|
+
training_datasets=set(), # distilled
|
|
25
|
+
public_training_code="https://github.com/andersborges/dkmodel2vec",
|
|
26
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
model2vecdk_stem = ModelMeta(
|
|
31
|
+
loader=Model2VecModel, # type: ignore
|
|
32
|
+
name="andersborges/model2vecdk-stem",
|
|
33
|
+
languages=["dan-Latn"],
|
|
34
|
+
open_weights=True,
|
|
35
|
+
revision="cb576c78dcc1b729e4612645f61db59929d69e61",
|
|
36
|
+
release_date="2025-11-21",
|
|
37
|
+
n_parameters=48578560,
|
|
38
|
+
memory_usage_mb=185,
|
|
39
|
+
max_tokens=np.inf,
|
|
40
|
+
embed_dim=256,
|
|
41
|
+
license="mit",
|
|
42
|
+
similarity_fn_name=ScoringFunction.COSINE,
|
|
43
|
+
framework=["NumPy", "Sentence Transformers"],
|
|
44
|
+
reference="https://huggingface.co/andersborges/model2vecdk",
|
|
45
|
+
use_instructions=False,
|
|
46
|
+
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
47
|
+
superseded_by=None,
|
|
48
|
+
training_datasets=set(), # distilled
|
|
49
|
+
public_training_code="https://github.com/andersborges/dkmodel2vec",
|
|
50
|
+
public_training_data="https://huggingface.co/datasets/DDSC/nordic-embedding-training-data",
|
|
51
|
+
)
|
|
@@ -221,7 +221,7 @@ class CohereTextEmbeddingModel(AbsEncoder):
|
|
|
221
221
|
) -> None:
|
|
222
222
|
import cohere # type: ignore
|
|
223
223
|
|
|
224
|
-
self.model_name = model_name.
|
|
224
|
+
self.model_name = model_name.removeprefix("Cohere/Cohere-")
|
|
225
225
|
self.sep = sep
|
|
226
226
|
self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
|
|
227
227
|
if embedding_type not in get_args(EmbeddingType):
|
|
@@ -342,7 +342,6 @@ tarka_embedding_150m_v1 = ModelMeta(
|
|
|
342
342
|
|
|
343
343
|
tark_embedding_350_v1_kwargs = dict(
|
|
344
344
|
model_kwargs={
|
|
345
|
-
"attn_implementation": "flash_attention_2",
|
|
346
345
|
"torch_dtype": "bfloat16",
|
|
347
346
|
}, # use low-precision
|
|
348
347
|
trust_remote_code=True,
|
|
@@ -357,7 +356,7 @@ tarka_embedding_350m_v1 = ModelMeta(
|
|
|
357
356
|
name="Tarka-AIR/Tarka-Embedding-350M-V1",
|
|
358
357
|
languages=MULTILINGUAL_EVALUATED_LANGUAGES,
|
|
359
358
|
open_weights=True,
|
|
360
|
-
revision="
|
|
359
|
+
revision="a850d6a329145474727424fed6b12b62096b8ba3",
|
|
361
360
|
release_date="2025-11-11",
|
|
362
361
|
n_parameters=354_483_968,
|
|
363
362
|
memory_usage_mb=676,
|
|
@@ -23,7 +23,7 @@ class BUCCBitextMining(AbsTaskBitextMining):
|
|
|
23
23
|
"path": "mteb/BUCC",
|
|
24
24
|
"revision": "414572247440f0ccacf7eb0bb70a31533a0e5443",
|
|
25
25
|
},
|
|
26
|
-
description="BUCC bitext mining dataset",
|
|
26
|
+
description="BUCC bitext mining dataset train split.",
|
|
27
27
|
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
|
|
28
28
|
type="BitextMining",
|
|
29
29
|
category="t2t",
|
|
@@ -71,7 +71,9 @@ Rapp, Reinhard},
|
|
|
71
71
|
|
|
72
72
|
sentence1 = data["sentence1"][0]
|
|
73
73
|
sentence2 = data["sentence2"][0]
|
|
74
|
-
sentence1 = [
|
|
74
|
+
sentence1 = [
|
|
75
|
+
sentence1[i] for (i, j) in gold
|
|
76
|
+
] # keep only sentences in gold. The 2nd value is meant for sentence2 but not used here. This is fixed in BUCC.v2.
|
|
75
77
|
logger.info(f"Lang {lang} num gold {len(gold)}")
|
|
76
78
|
logger.info(f"Lang {lang} num sentence1 {len(sentence1)}")
|
|
77
79
|
logger.info(f"Lang {lang} num sentence2 {len(sentence2)}")
|
|
@@ -20,7 +20,7 @@ class BUCCBitextMiningFast(AbsTaskBitextMining):
|
|
|
20
20
|
"path": "mteb/bucc-bitext-mining",
|
|
21
21
|
"revision": "1739dc11ffe9b7bfccd7f3d585aeb4c544fc6677",
|
|
22
22
|
},
|
|
23
|
-
description="BUCC bitext mining dataset",
|
|
23
|
+
description="BUCC bitext mining dataset train split, gold set only.",
|
|
24
24
|
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
|
|
25
25
|
type="BitextMining",
|
|
26
26
|
category="t2t",
|
|
@@ -26,5 +26,20 @@ class GreenNodeTableMarkdownRetrieval(AbsTaskRetrieval):
|
|
|
26
26
|
annotations_creators="human-annotated",
|
|
27
27
|
dialect=[],
|
|
28
28
|
sample_creation="found",
|
|
29
|
-
bibtex_citation=""
|
|
29
|
+
bibtex_citation=r"""
|
|
30
|
+
@inproceedings{10.1007/978-981-95-1746-6_17,
|
|
31
|
+
abstract = {Information retrieval often comes in plain text, lacking semi-structured text such as HTML and markdown, retrieving data that contains rich format such as table became non-trivial. In this paper, we tackle this challenge by introducing a new dataset, GreenNode Table Retrieval VN (GN-TRVN), which is collected from a massive corpus, a wide range of topics, and a longer context compared to ViQuAD2.0. To evaluate the effectiveness of our proposed dataset, we introduce two versions, M3-GN-VN and M3-GN-VN-Mixed, by fine-tuning the M3-Embedding model on this dataset. Experimental results show that our models consistently outperform the baselines, including the base model, across most evaluation criteria on various datasets such as VieQuADRetrieval, ZacLegalTextRetrieval, and GN-TRVN. In general, we release a more comprehensive dataset and two model versions that improve response performance for Vietnamese Markdown Table Retrieval.},
|
|
32
|
+
address = {Singapore},
|
|
33
|
+
author = {Pham, Bao Loc
|
|
34
|
+
and Hoang, Quoc Viet
|
|
35
|
+
and Luu, Quy Tung
|
|
36
|
+
and Vo, Trong Thu},
|
|
37
|
+
booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
|
|
38
|
+
isbn = {978-981-95-1746-6},
|
|
39
|
+
pages = {153--163},
|
|
40
|
+
publisher = {Springer Nature Singapore},
|
|
41
|
+
title = {GN-TRVN: A Benchmark for Vietnamese Table Markdown Retrieval Task},
|
|
42
|
+
year = {2026},
|
|
43
|
+
}
|
|
44
|
+
""",
|
|
30
45
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.1.
|
|
3
|
+
Version: 2.1.18
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -1445,6 +1445,7 @@ mteb/models/cache_wrappers/cache_backends/numpy_cache.py,sha256=GyTVC5DLph3EeRnD
|
|
|
1445
1445
|
mteb/models/model_implementations/__init__.py,sha256=BZDdde6ajKv-yroy9mqE2YS3Hw1KBdKoxBPg8aPTZEs,1164
|
|
1446
1446
|
mteb/models/model_implementations/align_models.py,sha256=DUdVWxETiwC2IrXI90zQwlvHMjeI7JJCNOmFVd2RNws,4518
|
|
1447
1447
|
mteb/models/model_implementations/amazon_models.py,sha256=pdRU2QGAB5ccQnAfbRSzHE1G3ZUdjvsAgeJwkB_olDQ,694
|
|
1448
|
+
mteb/models/model_implementations/andersborges.py,sha256=QUFpASdcCy-IMz2O2C3OAOhMWA2ksNHM4GFWlkELIT4,1879
|
|
1448
1449
|
mteb/models/model_implementations/ara_models.py,sha256=zS0t9rI21wwEwTlrlX94GqkmPKLnb8ktUaAOY-ZLmw0,1421
|
|
1449
1450
|
mteb/models/model_implementations/arctic_models.py,sha256=eaMRaN9WdpVq1W6cbtNcJMdrJUTXrTSYUjTJufCdZRY,10350
|
|
1450
1451
|
mteb/models/model_implementations/b1ade_models.py,sha256=aEKmXWVX8iJ_OotAYPOMxsOHTDEOJYdSwkR6iJsZ-ms,1609
|
|
@@ -1459,7 +1460,7 @@ mteb/models/model_implementations/cde_models.py,sha256=3nNU3nq3VZZcImFqH1VPj57-Q
|
|
|
1459
1460
|
mteb/models/model_implementations/clip_models.py,sha256=zrfgNmZszu0JMtMNdCMzEohixsrnQ7xFhCqgsiucH_Q,6107
|
|
1460
1461
|
mteb/models/model_implementations/codefuse_models.py,sha256=19Y-d_qetVU64quzEvuUJ_K8DHo1JEEKEGqjRR48dFg,9113
|
|
1461
1462
|
mteb/models/model_implementations/codesage_models.py,sha256=D4CdISGyv5f2GMYq4_efgm5qNq80SWAX5R2u5mjEiXM,2998
|
|
1462
|
-
mteb/models/model_implementations/cohere_models.py,sha256=
|
|
1463
|
+
mteb/models/model_implementations/cohere_models.py,sha256=LiYYRT3clhFlh0RE654KyZtO66vnIO22h79HJLmXYwk,13696
|
|
1463
1464
|
mteb/models/model_implementations/cohere_v.py,sha256=K6VEw1NkyM2PuMd18kHE6aqPrcByYSwEmAKjvLods_w,15760
|
|
1464
1465
|
mteb/models/model_implementations/colpali_models.py,sha256=uVmK3jXO-GDn-7i6cJFWdc0u0-MU3INHHL1rXUPhBec,8944
|
|
1465
1466
|
mteb/models/model_implementations/colqwen_models.py,sha256=Y6IBhYKbxjkC3AePa0l37-F50xcX1mtPCXpgW597HyA,8754
|
|
@@ -1538,7 +1539,7 @@ mteb/models/model_implementations/siglip_models.py,sha256=tvi8QB2ayBoeXsxwHrl5RF
|
|
|
1538
1539
|
mteb/models/model_implementations/sonar_models.py,sha256=Nc6kAJRWSrxA57DPRrgOPHqS1dNhz2vsE_1ZA2JtigQ,4784
|
|
1539
1540
|
mteb/models/model_implementations/spartan8806_atles_champion.py,sha256=9sWQH7tOT0uxXA7sbQcnqGt2f5O9xcw9HqFpRCzoQAA,918
|
|
1540
1541
|
mteb/models/model_implementations/stella_models.py,sha256=NL3tk-rnuBdznsQ-nmelqun4tFO2xKoNPPOOVKqnPGU,8062
|
|
1541
|
-
mteb/models/model_implementations/tarka_models.py,sha256=
|
|
1542
|
+
mteb/models/model_implementations/tarka_models.py,sha256=UwSb3e-k7dCgQAJv3176ZvKpkjLZfpdPzwf-b0Oxuuo,27345
|
|
1542
1543
|
mteb/models/model_implementations/text2vec_models.py,sha256=zaHWRc2W0RYZAOetinqRzug9UGW0HmY5U-jYsLXA8wo,4160
|
|
1543
1544
|
mteb/models/model_implementations/ua_sentence_models.py,sha256=fcvXR4-Rrt-UDTlDkh2ZAO1gO_ufCOHiT6EhoeKiHx8,1224
|
|
1544
1545
|
mteb/models/model_implementations/uae_models.py,sha256=KZxH5a3t-sfh33xUBkLizEuyFAyPlGfnRsn-S7mjq74,3112
|
|
@@ -1581,8 +1582,8 @@ mteb/tasks/bitext_mining/kat/__init__.py,sha256=a-KcFJ3Ol7R8yq02RcGjaOxEfqnwJeo7
|
|
|
1581
1582
|
mteb/tasks/bitext_mining/kat/tbilisi_city_hall_bitext_mining.py,sha256=xVCxpJr7UW2KadNdn7Gsw-wZ65uz5vhRDhQZ7eILokQ,1918
|
|
1582
1583
|
mteb/tasks/bitext_mining/multilingual/__init__.py,sha256=qDgixbHEvV3xz6JN3kFQDnvtSL-yVjl-Z8inUwimI6I,1954
|
|
1583
1584
|
mteb/tasks/bitext_mining/multilingual/bible_nlp_bitext_mining.py,sha256=lNbCz3dN9o3F04Y7vtNBhF-lPUNyVbAOKgUR-QKZn_8,29082
|
|
1584
|
-
mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py,sha256=
|
|
1585
|
-
mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py,sha256=
|
|
1585
|
+
mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py,sha256=tTKvS-v7d9V_zymCn_ZonUKlo9NI7vTyppxS9iAu8I0,2873
|
|
1586
|
+
mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining_fast.py,sha256=P_UHMWh2gKG6CloXmP5J2kjrCTQwoJAU1MKdLl6JFKc,1836
|
|
1586
1587
|
mteb/tasks/bitext_mining/multilingual/danish_medicines_agency_bitext_mining.py,sha256=5iengckKv1NCHILjrX6WDEgBNJlmbSV5y-WWdaLZYrs,1703
|
|
1587
1588
|
mteb/tasks/bitext_mining/multilingual/diabla_bitext_mining.py,sha256=Ua6DfpJYgoaIRSWB284WNj6wQWqxPiC3kUPcmKNGDWQ,1498
|
|
1588
1589
|
mteb/tasks/bitext_mining/multilingual/flores_bitext_mining.py,sha256=Dh43H2NWZIAzN9Zeib3TtKhUcK1jldUQX-GvTsm1MnI,5616
|
|
@@ -2441,7 +2442,7 @@ mteb/tasks/retrieval/vie/cqa_dupstack_wordpress_vn_retrieval.py,sha256=ppFPam-3A
|
|
|
2441
2442
|
mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py,sha256=hOiwz2bcayDW6VrCvsIGeYh1TT7koByM76rZZwtp9KA,1754
|
|
2442
2443
|
mteb/tasks/retrieval/vie/fevervn_retrieval.py,sha256=xLGoXefGk1l1AFiOSf2Ja0fM_rAQp4tdaR8H6jJqYlI,1853
|
|
2443
2444
|
mteb/tasks/retrieval/vie/fi_qa2018_vn_retrieval.py,sha256=FGfFuLzRCTuupRxZdjVbBiwCOSspb3vwvtNAKvyXjso,1714
|
|
2444
|
-
mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py,sha256=
|
|
2445
|
+
mteb/tasks/retrieval/vie/green_node_table_markdown_retrieval.py,sha256=O7iIcuvqhrHjB7J1VxH9YJ3v6cuFFBQdrrnYwLgeRfE,2429
|
|
2445
2446
|
mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py,sha256=FYWj8EhnfwDuPRxZ8uTeGkfa2Q-jDU2bliTmp975Coc,1837
|
|
2446
2447
|
mteb/tasks/retrieval/vie/msmarcovn_retrieval.py,sha256=xtJ1-rjx4slwSR8p6NedqItTk-79ZzT2f9FlDOhbzkE,1958
|
|
2447
2448
|
mteb/tasks/retrieval/vie/nf_corpus_vn_retrieval.py,sha256=4S8IDJ-TVjKEy2teM8GOeDzHIZR8txkPvX0sGDYIyqs,1780
|
|
@@ -2555,9 +2556,9 @@ mteb/types/_metadata.py,sha256=NN-W0S6a5TDV7UkpRx1pyWtGF4TyyCyoPUfHOwdeci8,2290
|
|
|
2555
2556
|
mteb/types/_result.py,sha256=CRAUc5IvqI3_9SyXDwv-PWLCXwXdZem9RePeYESRtuw,996
|
|
2556
2557
|
mteb/types/_string_validators.py,sha256=PY-dYq4E8O50VS3bLYdldPWp400fl_WzUjfVSkNWe8U,523
|
|
2557
2558
|
mteb/types/statistics.py,sha256=YwJsxTf1eaCI_RE-J37a-gK5wDeGAsmkeZKoZCFihSo,3755
|
|
2558
|
-
mteb-2.1.
|
|
2559
|
-
mteb-2.1.
|
|
2560
|
-
mteb-2.1.
|
|
2561
|
-
mteb-2.1.
|
|
2562
|
-
mteb-2.1.
|
|
2563
|
-
mteb-2.1.
|
|
2559
|
+
mteb-2.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
2560
|
+
mteb-2.1.18.dist-info/METADATA,sha256=OF7o0Df2GbEQuuNXdXxByJR0atmGv4XdRrwMSWuOcx0,13574
|
|
2561
|
+
mteb-2.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
2562
|
+
mteb-2.1.18.dist-info/entry_points.txt,sha256=8IJoEJFKoDHmVnNev-qJ9pp4Ln7_1-ma9QsXnzVCzGU,39
|
|
2563
|
+
mteb-2.1.18.dist-info/top_level.txt,sha256=OLVIjcQAlWBz0bdmutKlWHLF42FF0hp4uVAg3ZyiG4U,5
|
|
2564
|
+
mteb-2.1.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|