mteb 2.0.5__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +10 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +75 -0
- mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
- mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
- mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
- mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
- mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
- mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
- mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
- mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
- mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
- mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
- mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
- mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
- mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
- mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
- mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
- mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
- mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
- mteb/tasks/classification/nld/__init__.py +16 -0
- mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
- mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
- mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
- mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
- mteb/tasks/classification/nld/iconclass_classification.py +41 -0
- mteb/tasks/classification/nld/open_tender_classification.py +38 -0
- mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
- mteb/tasks/clustering/__init__.py +1 -0
- mteb/tasks/clustering/nld/__init__.py +17 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
- mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
- mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
- mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
- mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
- mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
- mteb/tasks/multilabel_classification/__init__.py +1 -0
- mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
- mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
- mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
- mteb/tasks/pair_classification/__init__.py +1 -0
- mteb/tasks/pair_classification/nld/__init__.py +7 -0
- mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
- mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
- mteb/tasks/retrieval/nld/__init__.py +10 -0
- mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
- mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
- mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
- mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
- mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
- mteb/tasks/sts/__init__.py +1 -0
- mteb/tasks/sts/nld/__init__.py +5 -0
- mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/METADATA +2 -204
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/RECORD +67 -15
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/WHEEL +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/entry_points.txt +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.0.5.dist-info → mteb-2.1.0.dist-info}/top_level.txt +0 -0
mteb/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from importlib.metadata import version
|
|
2
2
|
|
|
3
|
+
from mteb import types
|
|
3
4
|
from mteb.abstasks import AbsTask
|
|
4
5
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
5
6
|
from mteb.deprecated_evaluator import MTEB
|
|
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
|
|
|
7
8
|
from mteb.filter_tasks import filter_tasks
|
|
8
9
|
from mteb.get_tasks import get_task, get_tasks
|
|
9
10
|
from mteb.load_results import load_results
|
|
10
|
-
from mteb.models import EncoderProtocol, SentenceTransformerEncoderWrapper
|
|
11
|
+
from mteb.models import (
|
|
12
|
+
CrossEncoderProtocol,
|
|
13
|
+
EncoderProtocol,
|
|
14
|
+
SearchProtocol,
|
|
15
|
+
SentenceTransformerEncoderWrapper,
|
|
16
|
+
)
|
|
11
17
|
from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
|
|
12
18
|
from mteb.results import BenchmarkResults, TaskResult
|
|
13
19
|
|
|
@@ -21,7 +27,9 @@ __all__ = [
|
|
|
21
27
|
"AbsTask",
|
|
22
28
|
"Benchmark",
|
|
23
29
|
"BenchmarkResults",
|
|
30
|
+
"CrossEncoderProtocol",
|
|
24
31
|
"EncoderProtocol",
|
|
32
|
+
"SearchProtocol",
|
|
25
33
|
"SentenceTransformerEncoderWrapper",
|
|
26
34
|
"TaskMetadata",
|
|
27
35
|
"TaskResult",
|
|
@@ -35,4 +43,5 @@ __all__ = [
|
|
|
35
43
|
"get_task",
|
|
36
44
|
"get_tasks",
|
|
37
45
|
"load_results",
|
|
46
|
+
"types",
|
|
38
47
|
]
|
|
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
|
|
|
27
27
|
MTEB_KOR,
|
|
28
28
|
MTEB_MAIN_RU,
|
|
29
29
|
MTEB_MINERS_BITEXT_MINING,
|
|
30
|
+
MTEB_NL,
|
|
30
31
|
MTEB_POL,
|
|
31
32
|
MTEB_RETRIEVAL_LAW,
|
|
32
33
|
MTEB_RETRIEVAL_MEDICAL,
|
|
@@ -87,6 +88,7 @@ __all__ = [
|
|
|
87
88
|
"MTEB_KOR",
|
|
88
89
|
"MTEB_MAIN_RU",
|
|
89
90
|
"MTEB_MINERS_BITEXT_MINING",
|
|
91
|
+
"MTEB_NL",
|
|
90
92
|
"MTEB_POL",
|
|
91
93
|
"MTEB_RETRIEVAL_LAW",
|
|
92
94
|
"MTEB_RETRIEVAL_MEDICAL",
|
|
@@ -1636,6 +1636,81 @@ BEIR_NL = Benchmark(
|
|
|
1636
1636
|
""",
|
|
1637
1637
|
)
|
|
1638
1638
|
|
|
1639
|
+
MTEB_NL = Benchmark(
|
|
1640
|
+
name="MTEB(nld, v1)",
|
|
1641
|
+
display_name="MTEB-NL",
|
|
1642
|
+
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
|
|
1643
|
+
tasks=MTEBTasks(
|
|
1644
|
+
get_tasks(
|
|
1645
|
+
languages=["nld"],
|
|
1646
|
+
exclusive_language_filter=True,
|
|
1647
|
+
tasks=[
|
|
1648
|
+
# Classification
|
|
1649
|
+
"DutchBookReviewSentimentClassification",
|
|
1650
|
+
"MassiveIntentClassification",
|
|
1651
|
+
"MassiveScenarioClassification",
|
|
1652
|
+
"SIB200Classification",
|
|
1653
|
+
"MultiHateClassification",
|
|
1654
|
+
"VaccinChatNLClassification",
|
|
1655
|
+
"DutchColaClassification",
|
|
1656
|
+
"DutchGovernmentBiasClassification",
|
|
1657
|
+
"DutchSarcasticHeadlinesClassification",
|
|
1658
|
+
"DutchNewsArticlesClassification",
|
|
1659
|
+
"OpenTenderClassification",
|
|
1660
|
+
"IconclassClassification",
|
|
1661
|
+
# # PairClassification
|
|
1662
|
+
"SICKNLPairClassification",
|
|
1663
|
+
"XLWICNLPairClassification",
|
|
1664
|
+
# # MultiLabelClassification
|
|
1665
|
+
"CovidDisinformationNLMultiLabelClassification",
|
|
1666
|
+
"MultiEURLEXMultilabelClassification",
|
|
1667
|
+
"VABBMultiLabelClassification",
|
|
1668
|
+
# # Clustering
|
|
1669
|
+
"DutchNewsArticlesClusteringS2S",
|
|
1670
|
+
"DutchNewsArticlesClusteringP2P",
|
|
1671
|
+
"SIB200ClusteringS2S",
|
|
1672
|
+
"VABBClusteringS2S",
|
|
1673
|
+
"VABBClusteringP2P",
|
|
1674
|
+
"OpenTenderClusteringS2S",
|
|
1675
|
+
"OpenTenderClusteringP2P",
|
|
1676
|
+
"IconclassClusteringS2S",
|
|
1677
|
+
# # Reranking
|
|
1678
|
+
"WikipediaRerankingMultilingual",
|
|
1679
|
+
# # Retrieval
|
|
1680
|
+
"ArguAna-NL",
|
|
1681
|
+
"SCIDOCS-NL",
|
|
1682
|
+
"SciFact-NL",
|
|
1683
|
+
"NFCorpus-NL",
|
|
1684
|
+
"BelebeleRetrieval",
|
|
1685
|
+
# "WebFAQRetrieval",
|
|
1686
|
+
"DutchNewsArticlesRetrieval",
|
|
1687
|
+
"bBSARDNLRetrieval",
|
|
1688
|
+
"LegalQANLRetrieval",
|
|
1689
|
+
"OpenTenderRetrieval",
|
|
1690
|
+
"VABBRetrieval",
|
|
1691
|
+
"WikipediaRetrievalMultilingual",
|
|
1692
|
+
# # STS
|
|
1693
|
+
"SICK-NL-STS",
|
|
1694
|
+
"STSBenchmarkMultilingualSTS",
|
|
1695
|
+
],
|
|
1696
|
+
)
|
|
1697
|
+
),
|
|
1698
|
+
description="MTEB-NL",
|
|
1699
|
+
reference="https://arxiv.org/abs/2509.12340",
|
|
1700
|
+
contacts=["nikolay-banar"],
|
|
1701
|
+
citation=r"""
|
|
1702
|
+
@misc{banar2025mtebnle5nlembeddingbenchmark,
|
|
1703
|
+
archiveprefix = {arXiv},
|
|
1704
|
+
author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
|
|
1705
|
+
eprint = {2509.12340},
|
|
1706
|
+
primaryclass = {cs.CL},
|
|
1707
|
+
title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
|
|
1708
|
+
url = {https://arxiv.org/abs/2509.12340},
|
|
1709
|
+
year = {2025},
|
|
1710
|
+
}
|
|
1711
|
+
""",
|
|
1712
|
+
)
|
|
1713
|
+
|
|
1639
1714
|
MIEB_common_tasks = [
|
|
1640
1715
|
# Image Classification
|
|
1641
1716
|
"Birdsnap", # fine
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 2400,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 92146,
|
|
7
|
+
"min_text_length": 5,
|
|
8
|
+
"average_text_length": 38.39416666666666,
|
|
9
|
+
"max_text_length": 138,
|
|
10
|
+
"unique_texts": 2400
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"1": {
|
|
20
|
+
"count": 1200
|
|
21
|
+
},
|
|
22
|
+
"0": {
|
|
23
|
+
"count": 1200
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 19893,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 761416,
|
|
33
|
+
"min_text_length": 4,
|
|
34
|
+
"average_text_length": 38.27557432262605,
|
|
35
|
+
"max_text_length": 152,
|
|
36
|
+
"unique_texts": 19893
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1": {
|
|
46
|
+
"count": 12604
|
|
47
|
+
},
|
|
48
|
+
"0": {
|
|
49
|
+
"count": 7289
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 752,
|
|
4
|
+
"number_texts_intersect_with_train": 100,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 171956,
|
|
7
|
+
"min_text_length": 32,
|
|
8
|
+
"average_text_length": 228.66489361702128,
|
|
9
|
+
"max_text_length": 2746,
|
|
10
|
+
"unique_texts": 752
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"0.0": {
|
|
20
|
+
"count": 555
|
|
21
|
+
},
|
|
22
|
+
"1.0": {
|
|
23
|
+
"count": 197
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 1718,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 390362,
|
|
33
|
+
"min_text_length": 18,
|
|
34
|
+
"average_text_length": 227.2188591385332,
|
|
35
|
+
"max_text_length": 2662,
|
|
36
|
+
"unique_texts": 1718
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1.0": {
|
|
46
|
+
"count": 470
|
|
47
|
+
},
|
|
48
|
+
"0.0": {
|
|
49
|
+
"count": 1248
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 1200,
|
|
4
|
+
"number_texts_intersect_with_train": 1,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 2034506,
|
|
7
|
+
"min_text_length": 184,
|
|
8
|
+
"average_text_length": 1695.4216666666666,
|
|
9
|
+
"max_text_length": 8825,
|
|
10
|
+
"unique_texts": 1200
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 8,
|
|
18
|
+
"labels": {
|
|
19
|
+
"Opmerkelijk": {
|
|
20
|
+
"count": 150
|
|
21
|
+
},
|
|
22
|
+
"Buitenland": {
|
|
23
|
+
"count": 150
|
|
24
|
+
},
|
|
25
|
+
"Cultuur & Media": {
|
|
26
|
+
"count": 150
|
|
27
|
+
},
|
|
28
|
+
"Binnenland": {
|
|
29
|
+
"count": 150
|
|
30
|
+
},
|
|
31
|
+
"Politiek": {
|
|
32
|
+
"count": 150
|
|
33
|
+
},
|
|
34
|
+
"Economie": {
|
|
35
|
+
"count": 150
|
|
36
|
+
},
|
|
37
|
+
"Tech": {
|
|
38
|
+
"count": 150
|
|
39
|
+
},
|
|
40
|
+
"Regionaal nieuws": {
|
|
41
|
+
"count": 150
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"train": {
|
|
47
|
+
"num_samples": 5600,
|
|
48
|
+
"number_texts_intersect_with_train": null,
|
|
49
|
+
"text_statistics": {
|
|
50
|
+
"total_text_length": 9620538,
|
|
51
|
+
"min_text_length": 106,
|
|
52
|
+
"average_text_length": 1717.9532142857142,
|
|
53
|
+
"max_text_length": 29389,
|
|
54
|
+
"unique_texts": 5600
|
|
55
|
+
},
|
|
56
|
+
"image_statistics": null,
|
|
57
|
+
"label_statistics": {
|
|
58
|
+
"min_labels_per_text": 1,
|
|
59
|
+
"average_label_per_text": 1.0,
|
|
60
|
+
"max_labels_per_text": 1,
|
|
61
|
+
"unique_labels": 8,
|
|
62
|
+
"labels": {
|
|
63
|
+
"Cultuur & Media": {
|
|
64
|
+
"count": 700
|
|
65
|
+
},
|
|
66
|
+
"Binnenland": {
|
|
67
|
+
"count": 700
|
|
68
|
+
},
|
|
69
|
+
"Buitenland": {
|
|
70
|
+
"count": 700
|
|
71
|
+
},
|
|
72
|
+
"Regionaal nieuws": {
|
|
73
|
+
"count": 700
|
|
74
|
+
},
|
|
75
|
+
"Politiek": {
|
|
76
|
+
"count": 700
|
|
77
|
+
},
|
|
78
|
+
"Economie": {
|
|
79
|
+
"count": 700
|
|
80
|
+
},
|
|
81
|
+
"Opmerkelijk": {
|
|
82
|
+
"count": 700
|
|
83
|
+
},
|
|
84
|
+
"Tech": {
|
|
85
|
+
"count": 700
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 1326,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 82644,
|
|
7
|
+
"min_text_length": 17,
|
|
8
|
+
"average_text_length": 62.32579185520362,
|
|
9
|
+
"max_text_length": 117,
|
|
10
|
+
"unique_texts": 1326
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 2,
|
|
18
|
+
"labels": {
|
|
19
|
+
"0": {
|
|
20
|
+
"count": 826
|
|
21
|
+
},
|
|
22
|
+
"1": {
|
|
23
|
+
"count": 500
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"train": {
|
|
29
|
+
"num_samples": 10609,
|
|
30
|
+
"number_texts_intersect_with_train": null,
|
|
31
|
+
"text_statistics": {
|
|
32
|
+
"total_text_length": 658787,
|
|
33
|
+
"min_text_length": 7,
|
|
34
|
+
"average_text_length": 62.09699311904986,
|
|
35
|
+
"max_text_length": 161,
|
|
36
|
+
"unique_texts": 10609
|
|
37
|
+
},
|
|
38
|
+
"image_statistics": null,
|
|
39
|
+
"label_statistics": {
|
|
40
|
+
"min_labels_per_text": 1,
|
|
41
|
+
"average_label_per_text": 1.0,
|
|
42
|
+
"max_labels_per_text": 1,
|
|
43
|
+
"unique_labels": 2,
|
|
44
|
+
"labels": {
|
|
45
|
+
"1": {
|
|
46
|
+
"count": 4000
|
|
47
|
+
},
|
|
48
|
+
"0": {
|
|
49
|
+
"count": 6609
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 202,
|
|
4
|
+
"number_texts_intersect_with_train": null,
|
|
5
|
+
"text_statistics": {
|
|
6
|
+
"total_text_length": 11827,
|
|
7
|
+
"min_text_length": 6,
|
|
8
|
+
"average_text_length": 58.54950495049505,
|
|
9
|
+
"max_text_length": 403,
|
|
10
|
+
"unique_texts": 202
|
|
11
|
+
},
|
|
12
|
+
"image_statistics": null,
|
|
13
|
+
"label_statistics": {
|
|
14
|
+
"min_labels_per_text": 1,
|
|
15
|
+
"average_label_per_text": 1.0,
|
|
16
|
+
"max_labels_per_text": 1,
|
|
17
|
+
"unique_labels": 9,
|
|
18
|
+
"labels": {
|
|
19
|
+
"Geschiedenis": {
|
|
20
|
+
"count": 22
|
|
21
|
+
},
|
|
22
|
+
"Klassieke mythologie en Oude Geschiedenis": {
|
|
23
|
+
"count": 22
|
|
24
|
+
},
|
|
25
|
+
"Literatuur": {
|
|
26
|
+
"count": 23
|
|
27
|
+
},
|
|
28
|
+
"Natuur": {
|
|
29
|
+
"count": 23
|
|
30
|
+
},
|
|
31
|
+
"De mens, de mensheid in het algemeen": {
|
|
32
|
+
"count": 22
|
|
33
|
+
},
|
|
34
|
+
"Maatschappij, civilisatie en cultuur": {
|
|
35
|
+
"count": 22
|
|
36
|
+
},
|
|
37
|
+
"Abstracte idee\u00ebn en concepten": {
|
|
38
|
+
"count": 23
|
|
39
|
+
},
|
|
40
|
+
"Religie en magie": {
|
|
41
|
+
"count": 22
|
|
42
|
+
},
|
|
43
|
+
"Bijbel": {
|
|
44
|
+
"count": 23
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"train": {
|
|
50
|
+
"num_samples": 945,
|
|
51
|
+
"number_texts_intersect_with_train": null,
|
|
52
|
+
"text_statistics": {
|
|
53
|
+
"total_text_length": 52510,
|
|
54
|
+
"min_text_length": 3,
|
|
55
|
+
"average_text_length": 55.56613756613756,
|
|
56
|
+
"max_text_length": 793,
|
|
57
|
+
"unique_texts": 945
|
|
58
|
+
},
|
|
59
|
+
"image_statistics": null,
|
|
60
|
+
"label_statistics": {
|
|
61
|
+
"min_labels_per_text": 1,
|
|
62
|
+
"average_label_per_text": 1.0,
|
|
63
|
+
"max_labels_per_text": 1,
|
|
64
|
+
"unique_labels": 9,
|
|
65
|
+
"labels": {
|
|
66
|
+
"Literatuur": {
|
|
67
|
+
"count": 105
|
|
68
|
+
},
|
|
69
|
+
"Maatschappij, civilisatie en cultuur": {
|
|
70
|
+
"count": 105
|
|
71
|
+
},
|
|
72
|
+
"Klassieke mythologie en Oude Geschiedenis": {
|
|
73
|
+
"count": 105
|
|
74
|
+
},
|
|
75
|
+
"Bijbel": {
|
|
76
|
+
"count": 105
|
|
77
|
+
},
|
|
78
|
+
"De mens, de mensheid in het algemeen": {
|
|
79
|
+
"count": 105
|
|
80
|
+
},
|
|
81
|
+
"Abstracte idee\u00ebn en concepten": {
|
|
82
|
+
"count": 105
|
|
83
|
+
},
|
|
84
|
+
"Natuur": {
|
|
85
|
+
"count": 105
|
|
86
|
+
},
|
|
87
|
+
"Geschiedenis": {
|
|
88
|
+
"count": 105
|
|
89
|
+
},
|
|
90
|
+
"Religie en magie": {
|
|
91
|
+
"count": 105
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|