mteb 2.0.4__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. mteb/__init__.py +10 -1
  2. mteb/benchmarks/benchmarks/__init__.py +2 -0
  3. mteb/benchmarks/benchmarks/benchmarks.py +75 -0
  4. mteb/descriptive_stats/BitextMining/BUCC.json +70 -40
  5. mteb/descriptive_stats/Classification/DKHateClassification.json +40 -24
  6. mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
  7. mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
  8. mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
  9. mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
  10. mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json +23 -15
  11. mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
  12. mteb/descriptive_stats/Classification/ImdbClassification.json +40 -24
  13. mteb/descriptive_stats/Classification/KorHateClassification.json +23 -15
  14. mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
  15. mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
  16. mteb/descriptive_stats/Clustering/ArxivClusteringP2P.json +555 -550
  17. mteb/descriptive_stats/Clustering/ArxivClusteringP2P.v2.json +546 -541
  18. mteb/descriptive_stats/Clustering/ArxivClusteringS2S.json +555 -550
  19. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
  20. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
  21. mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
  22. mteb/descriptive_stats/Clustering/MLSUMClusteringP2P.json +2466 -2416
  23. mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
  24. mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
  25. mteb/descriptive_stats/Clustering/RedditClusteringP2P.json +1365 -1360
  26. mteb/descriptive_stats/Clustering/SNLClustering.json +378 -373
  27. mteb/descriptive_stats/Clustering/SwednClustering.json +28 -23
  28. mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
  29. mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
  30. mteb/descriptive_stats/Clustering/VGClustering.json +54 -49
  31. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/WITT2IRetrieval.json +324 -204
  32. mteb/descriptive_stats/Image/Any2AnyRetrieval/MemotionI2TRetrieval.json +28 -18
  33. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRAirbnbSyntheticRetrieval.json +334 -0
  34. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRGitHubReadmeRetrieval.json +544 -0
  35. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRTweetStockSyntheticsRetrieval.json +334 -0
  36. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRWikimediaCommonsDocumentsRetrieval.json +634 -0
  37. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2ESGReportsRetrieval.json +154 -0
  38. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2EconomicsReportsRetrieval.json +154 -0
  39. mteb/descriptive_stats/Image/ImageClassification/Imagenet1k.json +6039 -3007
  40. mteb/descriptive_stats/Image/ZeroShotClassification/Imagenet1kZeroShot.json +3024 -3010
  41. mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json +30 -16
  42. mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
  43. mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
  44. mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
  45. mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
  46. mteb/descriptive_stats/Reranking/MIRACLReranking.json +555 -479
  47. mteb/descriptive_stats/Reranking/MindSmallReranking.json +29 -25
  48. mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json +25 -26
  49. mteb/descriptive_stats/Retrieval/Code1Retrieval.json +30 -0
  50. mteb/descriptive_stats/Retrieval/DanFEVER.json +25 -26
  51. mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
  52. mteb/descriptive_stats/Retrieval/EnglishFinance1Retrieval.json +30 -0
  53. mteb/descriptive_stats/Retrieval/EnglishFinance2Retrieval.json +30 -0
  54. mteb/descriptive_stats/Retrieval/EnglishFinance3Retrieval.json +30 -0
  55. mteb/descriptive_stats/Retrieval/EnglishFinance4Retrieval.json +30 -0
  56. mteb/descriptive_stats/Retrieval/EnglishHealthcare1Retrieval.json +30 -0
  57. mteb/descriptive_stats/Retrieval/French1Retrieval.json +30 -0
  58. mteb/descriptive_stats/Retrieval/FrenchLegal1Retrieval.json +30 -0
  59. mteb/descriptive_stats/Retrieval/German1Retrieval.json +30 -0
  60. mteb/descriptive_stats/Retrieval/GermanHealthcare1Retrieval.json +30 -0
  61. mteb/descriptive_stats/Retrieval/GermanLegal1Retrieval.json +30 -0
  62. mteb/descriptive_stats/Retrieval/JapaneseCode1Retrieval.json +30 -0
  63. mteb/descriptive_stats/Retrieval/JapaneseLegal1Retrieval.json +30 -0
  64. mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
  65. mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json +475 -494
  66. mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json +25 -26
  67. mteb/descriptive_stats/Retrieval/MSMARCO.json +25 -84
  68. mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
  69. mteb/descriptive_stats/Retrieval/Touche2020.json +25 -26
  70. mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
  71. mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
  72. mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
  73. mteb/descriptive_stats/Summarization/SummEval.json +27 -50
  74. mteb/descriptive_stats/Summarization/SummEvalFr.json +27 -50
  75. mteb/models/model_implementations/kalm_models.py +29 -0
  76. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  77. mteb/tasks/classification/eng/financial_phrasebank_classification.py +0 -3
  78. mteb/tasks/classification/kor/kor_hate_classification.py +0 -12
  79. mteb/tasks/classification/nld/__init__.py +16 -0
  80. mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
  81. mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
  82. mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
  83. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
  84. mteb/tasks/classification/nld/iconclass_classification.py +41 -0
  85. mteb/tasks/classification/nld/open_tender_classification.py +38 -0
  86. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
  87. mteb/tasks/clustering/__init__.py +1 -0
  88. mteb/tasks/clustering/nld/__init__.py +17 -0
  89. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
  90. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
  91. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
  92. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
  93. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
  94. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
  95. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
  96. mteb/tasks/clustering/swe/swedn_clustering.py +2 -2
  97. mteb/tasks/multilabel_classification/__init__.py +1 -0
  98. mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
  99. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
  100. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
  101. mteb/tasks/pair_classification/__init__.py +1 -0
  102. mteb/tasks/pair_classification/nld/__init__.py +7 -0
  103. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
  104. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
  105. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  106. mteb/tasks/retrieval/nld/__init__.py +10 -0
  107. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
  108. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
  109. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
  110. mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
  111. mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
  112. mteb/tasks/sts/__init__.py +1 -0
  113. mteb/tasks/sts/nld/__init__.py +5 -0
  114. mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
  115. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/METADATA +2 -204
  116. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/RECORD +120 -49
  117. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/WHEEL +0 -0
  118. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/entry_points.txt +0 -0
  119. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/licenses/LICENSE +0 -0
  120. {mteb-2.0.4.dist-info → mteb-2.1.0.dist-info}/top_level.txt +0 -0
mteb/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from importlib.metadata import version
2
2
 
3
+ from mteb import types
3
4
  from mteb.abstasks import AbsTask
4
5
  from mteb.abstasks.task_metadata import TaskMetadata
5
6
  from mteb.deprecated_evaluator import MTEB
@@ -7,7 +8,12 @@ from mteb.evaluate import evaluate
7
8
  from mteb.filter_tasks import filter_tasks
8
9
  from mteb.get_tasks import get_task, get_tasks
9
10
  from mteb.load_results import load_results
10
- from mteb.models import EncoderProtocol, SentenceTransformerEncoderWrapper
11
+ from mteb.models import (
12
+ CrossEncoderProtocol,
13
+ EncoderProtocol,
14
+ SearchProtocol,
15
+ SentenceTransformerEncoderWrapper,
16
+ )
11
17
  from mteb.models.get_model_meta import get_model, get_model_meta, get_model_metas
12
18
  from mteb.results import BenchmarkResults, TaskResult
13
19
 
@@ -21,7 +27,9 @@ __all__ = [
21
27
  "AbsTask",
22
28
  "Benchmark",
23
29
  "BenchmarkResults",
30
+ "CrossEncoderProtocol",
24
31
  "EncoderProtocol",
32
+ "SearchProtocol",
25
33
  "SentenceTransformerEncoderWrapper",
26
34
  "TaskMetadata",
27
35
  "TaskResult",
@@ -35,4 +43,5 @@ __all__ = [
35
43
  "get_task",
36
44
  "get_tasks",
37
45
  "load_results",
46
+ "types",
38
47
  ]
@@ -27,6 +27,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
27
27
  MTEB_KOR,
28
28
  MTEB_MAIN_RU,
29
29
  MTEB_MINERS_BITEXT_MINING,
30
+ MTEB_NL,
30
31
  MTEB_POL,
31
32
  MTEB_RETRIEVAL_LAW,
32
33
  MTEB_RETRIEVAL_MEDICAL,
@@ -87,6 +88,7 @@ __all__ = [
87
88
  "MTEB_KOR",
88
89
  "MTEB_MAIN_RU",
89
90
  "MTEB_MINERS_BITEXT_MINING",
91
+ "MTEB_NL",
90
92
  "MTEB_POL",
91
93
  "MTEB_RETRIEVAL_LAW",
92
94
  "MTEB_RETRIEVAL_MEDICAL",
@@ -1636,6 +1636,81 @@ BEIR_NL = Benchmark(
1636
1636
  """,
1637
1637
  )
1638
1638
 
1639
+ MTEB_NL = Benchmark(
1640
+ name="MTEB(nld, v1)",
1641
+ display_name="MTEB-NL",
1642
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
1643
+ tasks=MTEBTasks(
1644
+ get_tasks(
1645
+ languages=["nld"],
1646
+ exclusive_language_filter=True,
1647
+ tasks=[
1648
+ # Classification
1649
+ "DutchBookReviewSentimentClassification",
1650
+ "MassiveIntentClassification",
1651
+ "MassiveScenarioClassification",
1652
+ "SIB200Classification",
1653
+ "MultiHateClassification",
1654
+ "VaccinChatNLClassification",
1655
+ "DutchColaClassification",
1656
+ "DutchGovernmentBiasClassification",
1657
+ "DutchSarcasticHeadlinesClassification",
1658
+ "DutchNewsArticlesClassification",
1659
+ "OpenTenderClassification",
1660
+ "IconclassClassification",
1661
+ # # PairClassification
1662
+ "SICKNLPairClassification",
1663
+ "XLWICNLPairClassification",
1664
+ # # MultiLabelClassification
1665
+ "CovidDisinformationNLMultiLabelClassification",
1666
+ "MultiEURLEXMultilabelClassification",
1667
+ "VABBMultiLabelClassification",
1668
+ # # Clustering
1669
+ "DutchNewsArticlesClusteringS2S",
1670
+ "DutchNewsArticlesClusteringP2P",
1671
+ "SIB200ClusteringS2S",
1672
+ "VABBClusteringS2S",
1673
+ "VABBClusteringP2P",
1674
+ "OpenTenderClusteringS2S",
1675
+ "OpenTenderClusteringP2P",
1676
+ "IconclassClusteringS2S",
1677
+ # # Reranking
1678
+ "WikipediaRerankingMultilingual",
1679
+ # # Retrieval
1680
+ "ArguAna-NL",
1681
+ "SCIDOCS-NL",
1682
+ "SciFact-NL",
1683
+ "NFCorpus-NL",
1684
+ "BelebeleRetrieval",
1685
+ # "WebFAQRetrieval",
1686
+ "DutchNewsArticlesRetrieval",
1687
+ "bBSARDNLRetrieval",
1688
+ "LegalQANLRetrieval",
1689
+ "OpenTenderRetrieval",
1690
+ "VABBRetrieval",
1691
+ "WikipediaRetrievalMultilingual",
1692
+ # # STS
1693
+ "SICK-NL-STS",
1694
+ "STSBenchmarkMultilingualSTS",
1695
+ ],
1696
+ )
1697
+ ),
1698
+ description="MTEB-NL",
1699
+ reference="https://arxiv.org/abs/2509.12340",
1700
+ contacts=["nikolay-banar"],
1701
+ citation=r"""
1702
+ @misc{banar2025mtebnle5nlembeddingbenchmark,
1703
+ archiveprefix = {arXiv},
1704
+ author = {Nikolay Banar and Ehsan Lotfi and Jens Van Nooten and Cristina Arhiliuc and Marija Kliocaite and Walter Daelemans},
1705
+ eprint = {2509.12340},
1706
+ primaryclass = {cs.CL},
1707
+ title = {MTEB-NL and E5-NL: Embedding Benchmark and Models for Dutch},
1708
+ url = {https://arxiv.org/abs/2509.12340},
1709
+ year = {2025},
1710
+ }
1711
+ """,
1712
+ )
1713
+
1639
1714
  MIEB_common_tasks = [
1640
1715
  # Image Classification
1641
1716
  "Birdsnap", # fine
@@ -3,66 +3,96 @@
3
3
  "num_samples": 35000,
4
4
  "number_of_characters": 146737556,
5
5
  "unique_pairs": 35000,
6
- "min_sentence1_length": 16,
7
- "average_sentence1_length": 99.10931428571429,
8
- "max_sentence1_length": 204,
9
- "unique_sentence1": 34978,
10
- "min_sentence2_length": 17,
11
- "average_sentence2_length": 101.14933691422246,
12
- "max_sentence2_length": 339,
13
- "unique_sentence2": 1133728,
6
+ "sentence1_statistics": {
7
+ "total_text_length": 3468826,
8
+ "min_text_length": 16,
9
+ "average_text_length": 99.10931428571429,
10
+ "max_text_length": 204,
11
+ "unique_texts": 34978
12
+ },
13
+ "sentence2_statistics": {
14
+ "total_text_length": 143268730,
15
+ "min_text_length": 17,
16
+ "average_text_length": 101.14933691422246,
17
+ "max_text_length": 339,
18
+ "unique_texts": 1133728
19
+ },
14
20
  "hf_subset_descriptive_stats": {
15
21
  "de-en": {
16
22
  "num_samples": 9580,
17
23
  "number_of_characters": 41450074,
18
24
  "unique_pairs": 9580,
19
- "min_sentence1_length": 50,
20
- "average_sentence1_length": 109.07974947807934,
21
- "max_sentence1_length": 204,
22
- "unique_sentence1": 9573,
23
- "min_sentence2_length": 17,
24
- "average_sentence2_length": 101.18043156531952,
25
- "max_sentence2_length": 293,
26
- "unique_sentence2": 397151
25
+ "sentence1_statistics": {
26
+ "total_text_length": 1044984,
27
+ "min_text_length": 50,
28
+ "average_text_length": 109.07974947807934,
29
+ "max_text_length": 204,
30
+ "unique_texts": 9573
31
+ },
32
+ "sentence2_statistics": {
33
+ "total_text_length": 40405090,
34
+ "min_text_length": 17,
35
+ "average_text_length": 101.18043156531952,
36
+ "max_text_length": 293,
37
+ "unique_texts": 397151
38
+ }
27
39
  },
28
40
  "fr-en": {
29
41
  "num_samples": 9086,
30
42
  "number_of_characters": 38272453,
31
43
  "unique_pairs": 9086,
32
- "min_sentence1_length": 43,
33
- "average_sentence1_length": 99.31785163988553,
34
- "max_sentence1_length": 174,
35
- "unique_sentence1": 9081,
36
- "min_sentence2_length": 21,
37
- "average_sentence2_length": 101.05202942051324,
38
- "max_sentence2_length": 319,
39
- "unique_sentence2": 368033
44
+ "sentence1_statistics": {
45
+ "total_text_length": 902402,
46
+ "min_text_length": 43,
47
+ "average_text_length": 99.31785163988553,
48
+ "max_text_length": 174,
49
+ "unique_texts": 9081
50
+ },
51
+ "sentence2_statistics": {
52
+ "total_text_length": 37370051,
53
+ "min_text_length": 21,
54
+ "average_text_length": 101.05202942051324,
55
+ "max_text_length": 319,
56
+ "unique_texts": 368033
57
+ }
40
58
  },
41
59
  "ru-en": {
42
60
  "num_samples": 14435,
43
61
  "number_of_characters": 57904085,
44
62
  "unique_pairs": 14435,
45
- "min_sentence1_length": 40,
46
- "average_sentence1_length": 101.6593003117423,
47
- "max_sentence1_length": 186,
48
- "unique_sentence1": 14425,
49
- "min_sentence2_length": 21,
50
- "average_sentence2_length": 101.06828784332406,
51
- "max_sentence2_length": 339,
52
- "unique_sentence2": 555503
63
+ "sentence1_statistics": {
64
+ "total_text_length": 1467452,
65
+ "min_text_length": 40,
66
+ "average_text_length": 101.6593003117423,
67
+ "max_text_length": 186,
68
+ "unique_texts": 14425
69
+ },
70
+ "sentence2_statistics": {
71
+ "total_text_length": 56436633,
72
+ "min_text_length": 21,
73
+ "average_text_length": 101.06828784332406,
74
+ "max_text_length": 339,
75
+ "unique_texts": 555503
76
+ }
53
77
  },
54
78
  "zh-en": {
55
79
  "num_samples": 1899,
56
80
  "number_of_characters": 9110944,
57
81
  "unique_pairs": 1899,
58
- "min_sentence1_length": 16,
59
- "average_sentence1_length": 28.429699842022117,
60
- "max_sentence1_length": 40,
61
- "unique_sentence1": 1899,
62
- "min_sentence2_length": 22,
63
- "average_sentence2_length": 101.92388026108485,
64
- "max_sentence2_length": 249,
65
- "unique_sentence2": 88360
82
+ "sentence1_statistics": {
83
+ "total_text_length": 53988,
84
+ "min_text_length": 16,
85
+ "average_text_length": 28.429699842022117,
86
+ "max_text_length": 40,
87
+ "unique_texts": 1899
88
+ },
89
+ "sentence2_statistics": {
90
+ "total_text_length": 9056956,
91
+ "min_text_length": 22,
92
+ "average_text_length": 101.92388026108485,
93
+ "max_text_length": 249,
94
+ "unique_texts": 88360
95
+ }
66
96
  }
67
97
  }
68
98
  }
@@ -1,37 +1,53 @@
1
1
  {
2
2
  "test": {
3
3
  "num_samples": 329,
4
- "number_of_characters": 29011,
5
4
  "number_texts_intersect_with_train": 4,
6
- "min_text_length": 1,
7
- "average_text_length": 88.17933130699087,
8
- "max_text_length": 2434,
9
- "unique_text": 326,
10
- "unique_labels": 2,
11
- "labels": {
12
- "0": {
13
- "count": 288
14
- },
15
- "1": {
16
- "count": 41
5
+ "text_statistics": {
6
+ "total_text_length": 29011,
7
+ "min_text_length": 1,
8
+ "average_text_length": 88.17933130699087,
9
+ "max_text_length": 2434,
10
+ "unique_texts": 326
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0": {
20
+ "count": 288
21
+ },
22
+ "1": {
23
+ "count": 41
24
+ }
17
25
  }
18
26
  }
19
27
  },
20
28
  "train": {
21
29
  "num_samples": 2960,
22
- "number_of_characters": 307722,
23
30
  "number_texts_intersect_with_train": null,
24
- "min_text_length": 1,
25
- "average_text_length": 103.96013513513513,
26
- "max_text_length": 5403,
27
- "unique_text": 2902,
28
- "unique_labels": 2,
29
- "labels": {
30
- "0": {
31
- "count": 2576
32
- },
33
- "1": {
34
- "count": 384
31
+ "text_statistics": {
32
+ "total_text_length": 307722,
33
+ "min_text_length": 1,
34
+ "average_text_length": 103.96013513513513,
35
+ "max_text_length": 5403,
36
+ "unique_texts": 2902
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "0": {
46
+ "count": 2576
47
+ },
48
+ "1": {
49
+ "count": 384
50
+ }
35
51
  }
36
52
  }
37
53
  }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 2400,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 92146,
7
+ "min_text_length": 5,
8
+ "average_text_length": 38.39416666666666,
9
+ "max_text_length": 138,
10
+ "unique_texts": 2400
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "1": {
20
+ "count": 1200
21
+ },
22
+ "0": {
23
+ "count": 1200
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 19893,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 761416,
33
+ "min_text_length": 4,
34
+ "average_text_length": 38.27557432262605,
35
+ "max_text_length": 152,
36
+ "unique_texts": 19893
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 12604
47
+ },
48
+ "0": {
49
+ "count": 7289
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 752,
4
+ "number_texts_intersect_with_train": 100,
5
+ "text_statistics": {
6
+ "total_text_length": 171956,
7
+ "min_text_length": 32,
8
+ "average_text_length": 228.66489361702128,
9
+ "max_text_length": 2746,
10
+ "unique_texts": 752
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0.0": {
20
+ "count": 555
21
+ },
22
+ "1.0": {
23
+ "count": 197
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 1718,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 390362,
33
+ "min_text_length": 18,
34
+ "average_text_length": 227.2188591385332,
35
+ "max_text_length": 2662,
36
+ "unique_texts": 1718
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1.0": {
46
+ "count": 470
47
+ },
48
+ "0.0": {
49
+ "count": 1248
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,90 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1200,
4
+ "number_texts_intersect_with_train": 1,
5
+ "text_statistics": {
6
+ "total_text_length": 2034506,
7
+ "min_text_length": 184,
8
+ "average_text_length": 1695.4216666666666,
9
+ "max_text_length": 8825,
10
+ "unique_texts": 1200
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 8,
18
+ "labels": {
19
+ "Opmerkelijk": {
20
+ "count": 150
21
+ },
22
+ "Buitenland": {
23
+ "count": 150
24
+ },
25
+ "Cultuur & Media": {
26
+ "count": 150
27
+ },
28
+ "Binnenland": {
29
+ "count": 150
30
+ },
31
+ "Politiek": {
32
+ "count": 150
33
+ },
34
+ "Economie": {
35
+ "count": 150
36
+ },
37
+ "Tech": {
38
+ "count": 150
39
+ },
40
+ "Regionaal nieuws": {
41
+ "count": 150
42
+ }
43
+ }
44
+ }
45
+ },
46
+ "train": {
47
+ "num_samples": 5600,
48
+ "number_texts_intersect_with_train": null,
49
+ "text_statistics": {
50
+ "total_text_length": 9620538,
51
+ "min_text_length": 106,
52
+ "average_text_length": 1717.9532142857142,
53
+ "max_text_length": 29389,
54
+ "unique_texts": 5600
55
+ },
56
+ "image_statistics": null,
57
+ "label_statistics": {
58
+ "min_labels_per_text": 1,
59
+ "average_label_per_text": 1.0,
60
+ "max_labels_per_text": 1,
61
+ "unique_labels": 8,
62
+ "labels": {
63
+ "Cultuur & Media": {
64
+ "count": 700
65
+ },
66
+ "Binnenland": {
67
+ "count": 700
68
+ },
69
+ "Buitenland": {
70
+ "count": 700
71
+ },
72
+ "Regionaal nieuws": {
73
+ "count": 700
74
+ },
75
+ "Politiek": {
76
+ "count": 700
77
+ },
78
+ "Economie": {
79
+ "count": 700
80
+ },
81
+ "Opmerkelijk": {
82
+ "count": 700
83
+ },
84
+ "Tech": {
85
+ "count": 700
86
+ }
87
+ }
88
+ }
89
+ }
90
+ }
@@ -0,0 +1,54 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 1326,
4
+ "number_texts_intersect_with_train": null,
5
+ "text_statistics": {
6
+ "total_text_length": 82644,
7
+ "min_text_length": 17,
8
+ "average_text_length": 62.32579185520362,
9
+ "max_text_length": 117,
10
+ "unique_texts": 1326
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 2,
18
+ "labels": {
19
+ "0": {
20
+ "count": 826
21
+ },
22
+ "1": {
23
+ "count": 500
24
+ }
25
+ }
26
+ }
27
+ },
28
+ "train": {
29
+ "num_samples": 10609,
30
+ "number_texts_intersect_with_train": null,
31
+ "text_statistics": {
32
+ "total_text_length": 658787,
33
+ "min_text_length": 7,
34
+ "average_text_length": 62.09699311904986,
35
+ "max_text_length": 161,
36
+ "unique_texts": 10609
37
+ },
38
+ "image_statistics": null,
39
+ "label_statistics": {
40
+ "min_labels_per_text": 1,
41
+ "average_label_per_text": 1.0,
42
+ "max_labels_per_text": 1,
43
+ "unique_labels": 2,
44
+ "labels": {
45
+ "1": {
46
+ "count": 4000
47
+ },
48
+ "0": {
49
+ "count": 6609
50
+ }
51
+ }
52
+ }
53
+ }
54
+ }
@@ -1,22 +1,30 @@
1
1
  {
2
2
  "train": {
3
3
  "num_samples": 2264,
4
- "number_of_characters": 276123,
5
4
  "number_texts_intersect_with_train": null,
6
- "min_text_length": 9,
7
- "average_text_length": 121.96245583038869,
8
- "max_text_length": 315,
9
- "unique_text": 2259,
10
- "unique_labels": 3,
11
- "labels": {
12
- "1": {
13
- "count": 1391
14
- },
15
- "2": {
16
- "count": 570
17
- },
18
- "0": {
19
- "count": 303
5
+ "text_statistics": {
6
+ "total_text_length": 276123,
7
+ "min_text_length": 9,
8
+ "average_text_length": 121.96245583038869,
9
+ "max_text_length": 315,
10
+ "unique_texts": 2259
11
+ },
12
+ "image_statistics": null,
13
+ "label_statistics": {
14
+ "min_labels_per_text": 1,
15
+ "average_label_per_text": 1.0,
16
+ "max_labels_per_text": 1,
17
+ "unique_labels": 3,
18
+ "labels": {
19
+ "1": {
20
+ "count": 1391
21
+ },
22
+ "2": {
23
+ "count": 570
24
+ },
25
+ "0": {
26
+ "count": 303
27
+ }
20
28
  }
21
29
  }
22
30
  }