mteb 2.0.4__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +75 -0
- mteb/descriptive_stats/BitextMining/BUCC.json +70 -40
- mteb/descriptive_stats/Classification/DKHateClassification.json +40 -24
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json +23 -15
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/ImdbClassification.json +40 -24
- mteb/descriptive_stats/Classification/KorHateClassification.json +23 -15
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/ArxivClusteringP2P.json +555 -550
- mteb/descriptive_stats/Clustering/ArxivClusteringP2P.v2.json +546 -541
- mteb/descriptive_stats/Clustering/ArxivClusteringS2S.json +555 -550
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/MLSUMClusteringP2P.json +2466 -2416
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/RedditClusteringP2P.json +1365 -1360
- mteb/descriptive_stats/Clustering/SNLClustering.json +378 -373
- mteb/descriptive_stats/Clustering/SwednClustering.json +28 -23
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/Clustering/VGClustering.json +54 -49
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/WITT2IRetrieval.json +324 -204
- mteb/descriptive_stats/Image/Any2AnyRetrieval/MemotionI2TRetrieval.json +28 -18
- mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRAirbnbSyntheticRetrieval.json +334 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRGitHubReadmeRetrieval.json +544 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRTweetStockSyntheticsRetrieval.json +334 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRWikimediaCommonsDocumentsRetrieval.json +634 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2ESGReportsRetrieval.json +154 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2EconomicsReportsRetrieval.json +154 -0
- mteb/descriptive_stats/Image/ImageClassification/Imagenet1k.json +6039 -3007
- mteb/descriptive_stats/Image/ZeroShotClassification/Imagenet1kZeroShot.json +3024 -3010
- mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json +30 -16
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Reranking/MIRACLReranking.json +555 -479
- mteb/descriptive_stats/Reranking/MindSmallReranking.json +29 -25
- mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json +25 -26
- mteb/descriptive_stats/Retrieval/Code1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/DanFEVER.json +25 -26
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EnglishFinance1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EnglishFinance2Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EnglishFinance3Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EnglishFinance4Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EnglishHealthcare1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/French1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/FrenchLegal1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/German1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/GermanHealthcare1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/GermanLegal1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/JapaneseCode1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/JapaneseLegal1Retrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json +475 -494
- mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json +25 -26
- mteb/descriptive_stats/Retrieval/MSMARCO.json +25 -84
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/Touche2020.json +25 -26
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/descriptive_stats/Summarization/SummEval.json +27 -50
- mteb/descriptive_stats/Summarization/SummEvalFr.json +27 -50
- mteb/models/model_implementations/kalm_models.py +29 -0
- mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
- mteb/tasks/classification/eng/financial_phrasebank_classification.py +0 -3
- mteb/tasks/classification/kor/kor_hate_classification.py +0 -12
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
- mteb/tasks/classification/nld/iconclass_classification.py +41 -0
- mteb/tasks/classification/nld/open_tender_classification.py +38 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
- mteb/tasks/clustering/swe/swedn_clustering.py +2 -2
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
- mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
- mteb/tasks/retrieval/nld/__init__.py +10 -0
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
- mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/METADATA +2 -204
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/RECORD +120 -49
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/WHEEL +0 -0
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,31 +1,30 @@
|
|
|
1
1
|
{
|
|
2
2
|
"dev": {
|
|
3
3
|
"num_samples": 8848803,
|
|
4
|
-
"number_of_characters":
|
|
5
|
-
"
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
"
|
|
29
|
-
"max_top_ranked_per_query": null
|
|
4
|
+
"number_of_characters": 2707180622,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 2706977894,
|
|
7
|
+
"min_text_length": 0,
|
|
8
|
+
"average_text_length": 306.15608274447476,
|
|
9
|
+
"max_text_length": 1617,
|
|
10
|
+
"unique_texts": 8827413
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 202728,
|
|
15
|
+
"min_text_length": 7,
|
|
16
|
+
"average_text_length": 29.044126074498568,
|
|
17
|
+
"max_text_length": 158,
|
|
18
|
+
"unique_texts": 6978
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 7437,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0654727793696275,
|
|
25
|
+
"max_relevant_docs_per_query": 4,
|
|
26
|
+
"unique_relevant_docs": 7433
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
30
29
|
}
|
|
31
30
|
}
|
|
@@ -1,89 +1,30 @@
|
|
|
1
1
|
{
|
|
2
|
-
"train": {
|
|
3
|
-
"num_samples": 9344762,
|
|
4
|
-
"number_of_characters": 2994608051,
|
|
5
|
-
"num_documents": 8841823,
|
|
6
|
-
"min_document_length": 4,
|
|
7
|
-
"average_document_length": 336.79716603691344,
|
|
8
|
-
"max_document_length": 1670,
|
|
9
|
-
"unique_documents": 8841823,
|
|
10
|
-
"num_queries": 502939,
|
|
11
|
-
"min_query_length": 5,
|
|
12
|
-
"average_query_length": 33.21898281898998,
|
|
13
|
-
"max_query_length": 215,
|
|
14
|
-
"unique_queries": 502939,
|
|
15
|
-
"none_queries": 0,
|
|
16
|
-
"num_relevant_docs": 532751,
|
|
17
|
-
"min_relevant_docs_per_query": 1,
|
|
18
|
-
"average_relevant_docs_per_query": 1.0592755781516248,
|
|
19
|
-
"max_relevant_docs_per_query": 7,
|
|
20
|
-
"unique_relevant_docs": 516472,
|
|
21
|
-
"num_instructions": null,
|
|
22
|
-
"min_instruction_length": null,
|
|
23
|
-
"average_instruction_length": null,
|
|
24
|
-
"max_instruction_length": null,
|
|
25
|
-
"unique_instructions": null,
|
|
26
|
-
"num_top_ranked": null,
|
|
27
|
-
"min_top_ranked_per_query": null,
|
|
28
|
-
"average_top_ranked_per_query": null,
|
|
29
|
-
"max_top_ranked_per_query": null
|
|
30
|
-
},
|
|
31
2
|
"dev": {
|
|
32
3
|
"num_samples": 8848803,
|
|
33
|
-
"number_of_characters":
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"max_top_ranked_per_query": null
|
|
59
|
-
},
|
|
60
|
-
"test": {
|
|
61
|
-
"num_samples": 8841866,
|
|
62
|
-
"number_of_characters": 2977902337,
|
|
63
|
-
"num_documents": 8841823,
|
|
64
|
-
"min_document_length": 4,
|
|
65
|
-
"average_document_length": 336.79716603691344,
|
|
66
|
-
"max_document_length": 1670,
|
|
67
|
-
"unique_documents": 8841823,
|
|
68
|
-
"num_queries": 43,
|
|
69
|
-
"min_query_length": 16,
|
|
70
|
-
"average_query_length": 32.74418604651163,
|
|
71
|
-
"max_query_length": 55,
|
|
72
|
-
"unique_queries": 43,
|
|
73
|
-
"none_queries": 0,
|
|
74
|
-
"num_relevant_docs": 9260,
|
|
75
|
-
"min_relevant_docs_per_query": 132,
|
|
76
|
-
"average_relevant_docs_per_query": 95.3953488372093,
|
|
77
|
-
"max_relevant_docs_per_query": 582,
|
|
78
|
-
"unique_relevant_docs": 9139,
|
|
79
|
-
"num_instructions": null,
|
|
80
|
-
"min_instruction_length": null,
|
|
81
|
-
"average_instruction_length": null,
|
|
82
|
-
"max_instruction_length": null,
|
|
83
|
-
"unique_instructions": null,
|
|
84
|
-
"num_top_ranked": null,
|
|
85
|
-
"min_top_ranked_per_query": null,
|
|
86
|
-
"average_top_ranked_per_query": null,
|
|
87
|
-
"max_top_ranked_per_query": null
|
|
4
|
+
"number_of_characters": 2969291276,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 2969059106,
|
|
7
|
+
"min_text_length": 3,
|
|
8
|
+
"average_text_length": 335.79716603691344,
|
|
9
|
+
"max_text_length": 1669,
|
|
10
|
+
"unique_texts": 8841661
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 232170,
|
|
15
|
+
"min_text_length": 9,
|
|
16
|
+
"average_text_length": 33.2621776504298,
|
|
17
|
+
"max_text_length": 186,
|
|
18
|
+
"unique_texts": 6980
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 7437,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0654727793696275,
|
|
25
|
+
"max_relevant_docs_per_query": 4,
|
|
26
|
+
"unique_relevant_docs": 7433
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
88
29
|
}
|
|
89
30
|
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 138633,
|
|
4
|
+
"number_of_characters": 59639635,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 59576581,
|
|
7
|
+
"min_text_length": 2,
|
|
8
|
+
"average_text_length": 432.86552643624714,
|
|
9
|
+
"max_text_length": 16782,
|
|
10
|
+
"unique_texts": 122413
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 63054,
|
|
15
|
+
"min_text_length": 9,
|
|
16
|
+
"average_text_length": 63.054,
|
|
17
|
+
"max_text_length": 286,
|
|
18
|
+
"unique_texts": 992
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 1000,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0,
|
|
25
|
+
"max_relevant_docs_per_query": 1,
|
|
26
|
+
"unique_relevant_docs": 1000
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -1,31 +1,30 @@
|
|
|
1
1
|
{
|
|
2
2
|
"test": {
|
|
3
3
|
"num_samples": 382594,
|
|
4
|
-
"number_of_characters":
|
|
5
|
-
"
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
"
|
|
21
|
-
"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
"
|
|
29
|
-
"max_top_ranked_per_query": null
|
|
4
|
+
"number_of_characters": 658104319,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 658102191,
|
|
7
|
+
"min_text_length": 3,
|
|
8
|
+
"average_text_length": 1720.326212602439,
|
|
9
|
+
"max_text_length": 106072,
|
|
10
|
+
"unique_texts": 379559
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 2128,
|
|
15
|
+
"min_text_length": 16,
|
|
16
|
+
"average_text_length": 43.42857142857143,
|
|
17
|
+
"max_text_length": 83,
|
|
18
|
+
"unique_texts": 49
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 932,
|
|
23
|
+
"min_relevant_docs_per_query": 40,
|
|
24
|
+
"average_relevant_docs_per_query": 19.020408163265305,
|
|
25
|
+
"max_relevant_docs_per_query": 52,
|
|
26
|
+
"unique_relevant_docs": 2099
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
30
29
|
}
|
|
31
30
|
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 10318,
|
|
4
|
+
"number_of_characters": 7839416,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 7765564,
|
|
7
|
+
"min_text_length": 9,
|
|
8
|
+
"average_text_length": 833.393861343636,
|
|
9
|
+
"max_text_length": 35146,
|
|
10
|
+
"unique_texts": 9123
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 73852,
|
|
15
|
+
"min_text_length": 7,
|
|
16
|
+
"average_text_length": 73.852,
|
|
17
|
+
"max_text_length": 258,
|
|
18
|
+
"unique_texts": 999
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 1000,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0,
|
|
25
|
+
"max_relevant_docs_per_query": 1,
|
|
26
|
+
"unique_relevant_docs": 1000
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 22637,
|
|
4
|
+
"number_of_characters": 21218611,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 21197901,
|
|
7
|
+
"min_text_length": 7,
|
|
8
|
+
"average_text_length": 945.7015837608744,
|
|
9
|
+
"max_text_length": 37834,
|
|
10
|
+
"unique_texts": 22415
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 20710,
|
|
15
|
+
"min_text_length": 22,
|
|
16
|
+
"average_text_length": 93.28828828828829,
|
|
17
|
+
"max_text_length": 250,
|
|
18
|
+
"unique_texts": 222
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 1059,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 4.77027027027027,
|
|
25
|
+
"max_relevant_docs_per_query": 57,
|
|
26
|
+
"unique_relevant_docs": 491
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 4902,
|
|
4
|
+
"number_of_characters": 463327,
|
|
5
|
+
"unique_pairs": 4902,
|
|
6
|
+
"text1_statistics": {
|
|
7
|
+
"total_text_length": 233941,
|
|
8
|
+
"min_text_length": 10,
|
|
9
|
+
"average_text_length": 47.72358221134231,
|
|
10
|
+
"max_text_length": 158,
|
|
11
|
+
"unique_texts": 3378
|
|
12
|
+
},
|
|
13
|
+
"text2_statistics": {
|
|
14
|
+
"total_text_length": 229386,
|
|
15
|
+
"min_text_length": 10,
|
|
16
|
+
"average_text_length": 46.79436964504284,
|
|
17
|
+
"max_text_length": 158,
|
|
18
|
+
"unique_texts": 3327
|
|
19
|
+
},
|
|
20
|
+
"image1_statistics": null,
|
|
21
|
+
"image2_statistics": null,
|
|
22
|
+
"label_statistics": {
|
|
23
|
+
"min_score": 1.0,
|
|
24
|
+
"avg_score": 3.528012039368932,
|
|
25
|
+
"max_score": 5.0
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
@@ -1,55 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"test": {
|
|
3
3
|
"num_samples": 100,
|
|
4
|
-
"number_of_characters":
|
|
5
|
-
"
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
1.0,
|
|
32
|
-
2.0,
|
|
33
|
-
1.0
|
|
34
|
-
],
|
|
35
|
-
"avg_relevance": 3.7770833333333336,
|
|
36
|
-
"max_relevance": [
|
|
37
|
-
5.0,
|
|
38
|
-
4.666666666666667,
|
|
39
|
-
4.333333333333333,
|
|
40
|
-
2.6666666666666665,
|
|
41
|
-
4.666666666666667,
|
|
42
|
-
4.666666666666667,
|
|
43
|
-
4.666666666666667,
|
|
44
|
-
4.333333333333333,
|
|
45
|
-
4.0,
|
|
46
|
-
4.333333333333333,
|
|
47
|
-
4.666666666666667,
|
|
48
|
-
4.666666666666667,
|
|
49
|
-
4.333333333333333,
|
|
50
|
-
2.3333333333333335,
|
|
51
|
-
4.666666666666667,
|
|
52
|
-
4.666666666666667
|
|
53
|
-
]
|
|
4
|
+
"number_of_characters": 1007527,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 210035,
|
|
7
|
+
"min_text_length": 626,
|
|
8
|
+
"average_text_length": 2100.35,
|
|
9
|
+
"max_text_length": 3153,
|
|
10
|
+
"unique_texts": 100
|
|
11
|
+
},
|
|
12
|
+
"human_summaries_statistics": {
|
|
13
|
+
"total_text_length": 248982,
|
|
14
|
+
"min_text_length": 86,
|
|
15
|
+
"average_text_length": 226.34727272727272,
|
|
16
|
+
"max_text_length": 717,
|
|
17
|
+
"unique_texts": 1100
|
|
18
|
+
},
|
|
19
|
+
"machine_summaries_statistics": {
|
|
20
|
+
"total_text_length": 548510,
|
|
21
|
+
"min_text_length": 35,
|
|
22
|
+
"average_text_length": 342.81875,
|
|
23
|
+
"max_text_length": 718,
|
|
24
|
+
"unique_texts": 1548
|
|
25
|
+
},
|
|
26
|
+
"score_statistics": {
|
|
27
|
+
"min_score": 1.0,
|
|
28
|
+
"avg_score": 3.777083333333336,
|
|
29
|
+
"max_score": 5.0
|
|
30
|
+
}
|
|
54
31
|
}
|
|
55
32
|
}
|
|
@@ -1,55 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"test": {
|
|
3
3
|
"num_samples": 100,
|
|
4
|
-
"number_of_characters":
|
|
5
|
-
"
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
1.0,
|
|
32
|
-
2.0,
|
|
33
|
-
1.0
|
|
34
|
-
],
|
|
35
|
-
"avg_relevance": 3.7770833333333336,
|
|
36
|
-
"max_relevance": [
|
|
37
|
-
5.0,
|
|
38
|
-
4.666666666666667,
|
|
39
|
-
4.333333333333333,
|
|
40
|
-
2.666666666666666,
|
|
41
|
-
4.666666666666667,
|
|
42
|
-
4.666666666666667,
|
|
43
|
-
4.666666666666667,
|
|
44
|
-
4.333333333333333,
|
|
45
|
-
4.0,
|
|
46
|
-
4.333333333333333,
|
|
47
|
-
4.666666666666667,
|
|
48
|
-
4.666666666666667,
|
|
49
|
-
4.333333333333333,
|
|
50
|
-
2.333333333333333,
|
|
51
|
-
4.666666666666667,
|
|
52
|
-
4.666666666666667
|
|
53
|
-
]
|
|
4
|
+
"number_of_characters": 1139767,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 240173,
|
|
7
|
+
"min_text_length": 668,
|
|
8
|
+
"average_text_length": 2401.73,
|
|
9
|
+
"max_text_length": 3699,
|
|
10
|
+
"unique_texts": 100
|
|
11
|
+
},
|
|
12
|
+
"human_summaries_statistics": {
|
|
13
|
+
"total_text_length": 284479,
|
|
14
|
+
"min_text_length": 76,
|
|
15
|
+
"average_text_length": 258.61727272727273,
|
|
16
|
+
"max_text_length": 815,
|
|
17
|
+
"unique_texts": 1100
|
|
18
|
+
},
|
|
19
|
+
"machine_summaries_statistics": {
|
|
20
|
+
"total_text_length": 615115,
|
|
21
|
+
"min_text_length": 0,
|
|
22
|
+
"average_text_length": 384.446875,
|
|
23
|
+
"max_text_length": 1079,
|
|
24
|
+
"unique_texts": 1540
|
|
25
|
+
},
|
|
26
|
+
"score_statistics": {
|
|
27
|
+
"min_score": 1.0,
|
|
28
|
+
"avg_score": 3.777083333333336,
|
|
29
|
+
"max_score": 5.0
|
|
30
|
+
}
|
|
54
31
|
}
|
|
55
32
|
}
|
|
@@ -766,3 +766,32 @@ HIT_TMG__KaLM_embedding_multilingual_mini_instruct_v2 = ModelMeta(
|
|
|
766
766
|
superseded_by=None,
|
|
767
767
|
citation=KALM_EMBEDDING_CITATION,
|
|
768
768
|
)
|
|
769
|
+
|
|
770
|
+
KaLM_Embedding_KaLM_embedding_multilingual_mini_instruct_v2_5 = ModelMeta(
|
|
771
|
+
loader=InstructSentenceTransformerModel,
|
|
772
|
+
loader_kwargs=dict(
|
|
773
|
+
instruction_template=KaLM_INSTRUCTION,
|
|
774
|
+
max_seq_length=512,
|
|
775
|
+
apply_instruction_to_passages=False,
|
|
776
|
+
prompts_dict=KaLM_v2_task_prompts,
|
|
777
|
+
),
|
|
778
|
+
name="KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5",
|
|
779
|
+
revision="6a4cfc1084cb459ebd4729b53a8656a61448c720",
|
|
780
|
+
release_date="2025-09-30",
|
|
781
|
+
languages=["eng-Latn", "zho-Hans"],
|
|
782
|
+
n_parameters=494032768,
|
|
783
|
+
memory_usage_mb=1885,
|
|
784
|
+
max_tokens=512,
|
|
785
|
+
embed_dim=896,
|
|
786
|
+
license="apache-2.0",
|
|
787
|
+
open_weights=True,
|
|
788
|
+
public_training_code=None,
|
|
789
|
+
public_training_data="https://huggingface.co/datasets/KaLM-Embedding/KaLM-embedding-finetuning-data",
|
|
790
|
+
framework=["PyTorch", "Sentence Transformers"],
|
|
791
|
+
reference="https://huggingface.co/KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5",
|
|
792
|
+
similarity_fn_name="cosine",
|
|
793
|
+
use_instructions=True,
|
|
794
|
+
training_datasets=kalm_v2_training_data,
|
|
795
|
+
adapted_from="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v2",
|
|
796
|
+
superseded_by=None,
|
|
797
|
+
)
|
|
@@ -21,7 +21,7 @@ class BUCCBitextMining(AbsTaskBitextMining):
|
|
|
21
21
|
name="BUCC",
|
|
22
22
|
dataset={
|
|
23
23
|
"path": "mteb/BUCC",
|
|
24
|
-
"revision": "
|
|
24
|
+
"revision": "414572247440f0ccacf7eb0bb70a31533a0e5443",
|
|
25
25
|
},
|
|
26
26
|
description="BUCC bitext mining dataset",
|
|
27
27
|
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
|
|
@@ -36,9 +36,6 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
|
|
|
36
36
|
superseded_by="FinancialPhrasebankClassification.v2",
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
-
def dataset_transform(self):
|
|
40
|
-
self.dataset = self.dataset.rename_column("sentence", "text")
|
|
41
|
-
|
|
42
39
|
|
|
43
40
|
class FinancialPhrasebankClassificationV2(AbsTaskClassification):
|
|
44
41
|
metadata = TaskMetadata(
|
|
@@ -44,18 +44,6 @@ class KorHateClassification(AbsTaskClassification):
|
|
|
44
44
|
superseded_by="KorHateClassification.v2",
|
|
45
45
|
)
|
|
46
46
|
|
|
47
|
-
def dataset_transform(self):
|
|
48
|
-
keep_cols = ["comments", "hate"]
|
|
49
|
-
rename_dict = dict(zip(keep_cols, ["text", "label"]))
|
|
50
|
-
remove_cols = [
|
|
51
|
-
col for col in self.dataset["test"].column_names if col not in keep_cols
|
|
52
|
-
]
|
|
53
|
-
self.dataset = self.dataset.rename_columns(rename_dict)
|
|
54
|
-
self.dataset = self.dataset.remove_columns(remove_cols)
|
|
55
|
-
self.dataset = self.stratified_subsampling(
|
|
56
|
-
self.dataset, seed=self.seed, splits=["train"]
|
|
57
|
-
)
|
|
58
|
-
|
|
59
47
|
|
|
60
48
|
class KorHateClassificationV2(AbsTaskClassification):
|
|
61
49
|
metadata = TaskMetadata(
|
|
@@ -2,8 +2,24 @@ from .dutch_book_review_sentiment_classification import (
|
|
|
2
2
|
DutchBookReviewSentimentClassification,
|
|
3
3
|
DutchBookReviewSentimentClassificationV2,
|
|
4
4
|
)
|
|
5
|
+
from .dutch_cola_classification import DutchColaClassification
|
|
6
|
+
from .dutch_government_bias_classification import DutchGovernmentBiasClassification
|
|
7
|
+
from .dutch_news_articles_classification import DutchNewsArticlesClassification
|
|
8
|
+
from .dutch_sarcastic_headlines_classification import (
|
|
9
|
+
DutchSarcasticHeadlinesClassification,
|
|
10
|
+
)
|
|
11
|
+
from .iconclass_classification import IconclassClassification
|
|
12
|
+
from .open_tender_classification import OpenTenderClassification
|
|
13
|
+
from .vaccin_chat_nl_classification import VaccinChatNLClassification
|
|
5
14
|
|
|
6
15
|
__all__ = [
|
|
7
16
|
"DutchBookReviewSentimentClassification",
|
|
8
17
|
"DutchBookReviewSentimentClassificationV2",
|
|
18
|
+
"DutchColaClassification",
|
|
19
|
+
"DutchGovernmentBiasClassification",
|
|
20
|
+
"DutchNewsArticlesClassification",
|
|
21
|
+
"DutchSarcasticHeadlinesClassification",
|
|
22
|
+
"IconclassClassification",
|
|
23
|
+
"OpenTenderClassification",
|
|
24
|
+
"VaccinChatNLClassification",
|
|
9
25
|
]
|