mteb 2.0.5__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. mteb/__init__.py +10 -1
  2. mteb/_create_dataloaders.py +2 -0
  3. mteb/abstasks/_stratification.py +1 -1
  4. mteb/abstasks/abstask.py +6 -1
  5. mteb/abstasks/dataset_card_template.md +1 -1
  6. mteb/abstasks/retrieval.py +2 -1
  7. mteb/abstasks/retrieval_dataset_loaders.py +1 -1
  8. mteb/abstasks/task_metadata.py +1 -1
  9. mteb/benchmarks/benchmarks/__init__.py +2 -0
  10. mteb/benchmarks/benchmarks/benchmarks.py +82 -11
  11. mteb/benchmarks/get_benchmark.py +1 -1
  12. mteb/descriptive_stats/Classification/DutchColaClassification.json +54 -0
  13. mteb/descriptive_stats/Classification/DutchGovernmentBiasClassification.json +54 -0
  14. mteb/descriptive_stats/Classification/DutchNewsArticlesClassification.json +90 -0
  15. mteb/descriptive_stats/Classification/DutchSarcasticHeadlinesClassification.json +54 -0
  16. mteb/descriptive_stats/Classification/IconclassClassification.json +96 -0
  17. mteb/descriptive_stats/Classification/OpenTenderClassification.json +222 -0
  18. mteb/descriptive_stats/Classification/VaccinChatNLClassification.json +1068 -0
  19. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringP2P.json +45 -0
  20. mteb/descriptive_stats/Clustering/DutchNewsArticlesClusteringS2S.json +45 -0
  21. mteb/descriptive_stats/Clustering/IconclassClusteringS2S.json +48 -0
  22. mteb/descriptive_stats/Clustering/OpenTenderClusteringP2P.json +111 -0
  23. mteb/descriptive_stats/Clustering/OpenTenderClusteringS2S.json +111 -0
  24. mteb/descriptive_stats/Clustering/VABBClusteringP2P.json +60 -0
  25. mteb/descriptive_stats/Clustering/VABBClusteringS2S.json +60 -0
  26. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
  27. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
  28. mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
  29. mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
  30. mteb/descriptive_stats/MultilabelClassification/CovidDisinformationNLMultiLabelClassification.json +84 -0
  31. mteb/descriptive_stats/MultilabelClassification/VABBMultiLabelClassification.json +156 -0
  32. mteb/descriptive_stats/PairClassification/SICKNLPairClassification.json +35 -0
  33. mteb/descriptive_stats/PairClassification/XLWICNLPairClassification.json +35 -0
  34. mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
  35. mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
  36. mteb/descriptive_stats/Retrieval/DutchNewsArticlesRetrieval.json +30 -0
  37. mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
  38. mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
  39. mteb/descriptive_stats/Retrieval/LegalQANLRetrieval.json +30 -0
  40. mteb/descriptive_stats/Retrieval/OpenTenderRetrieval.json +30 -0
  41. mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
  42. mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
  43. mteb/descriptive_stats/Retrieval/VABBRetrieval.json +30 -0
  44. mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
  45. mteb/descriptive_stats/Retrieval/bBSARDNLRetrieval.json +30 -0
  46. mteb/descriptive_stats/STS/SICK-NL-STS.json +28 -0
  47. mteb/languages/check_language_code.py +11 -3
  48. mteb/languages/language_scripts.py +4 -0
  49. mteb/leaderboard/text_segments.py +1 -1
  50. mteb/models/model_implementations/b1ade_models.py +1 -1
  51. mteb/models/model_implementations/bge_models.py +1 -3
  52. mteb/models/model_implementations/bmretriever_models.py +1 -1
  53. mteb/models/model_implementations/gme_v_models.py +2 -2
  54. mteb/models/model_implementations/ibm_granite_models.py +1 -1
  55. mteb/models/model_implementations/inf_models.py +3 -3
  56. mteb/models/model_implementations/jina_models.py +12 -2
  57. mteb/models/model_implementations/llm2vec_models.py +1 -1
  58. mteb/models/model_implementations/misc_models.py +2 -2
  59. mteb/models/model_implementations/mxbai_models.py +1 -1
  60. mteb/models/model_implementations/salesforce_models.py +1 -1
  61. mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
  62. mteb/models/model_implementations/voyage_v.py +9 -9
  63. mteb/results/task_result.py +6 -8
  64. mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
  65. mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
  66. mteb/tasks/classification/mya/myanmar_news.py +2 -2
  67. mteb/tasks/classification/nld/__init__.py +16 -0
  68. mteb/tasks/classification/nld/dutch_cola_classification.py +38 -0
  69. mteb/tasks/classification/nld/dutch_government_bias_classification.py +37 -0
  70. mteb/tasks/classification/nld/dutch_news_articles_classification.py +30 -0
  71. mteb/tasks/classification/nld/dutch_sarcastic_headlines_classification.py +36 -0
  72. mteb/tasks/classification/nld/iconclass_classification.py +41 -0
  73. mteb/tasks/classification/nld/open_tender_classification.py +38 -0
  74. mteb/tasks/classification/nld/vaccin_chat_nl_classification.py +46 -0
  75. mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
  76. mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
  77. mteb/tasks/clustering/__init__.py +1 -0
  78. mteb/tasks/clustering/nld/__init__.py +17 -0
  79. mteb/tasks/clustering/nld/dutch_news_articles_clustering_p2p.py +37 -0
  80. mteb/tasks/clustering/nld/dutch_news_articles_clustering_s2s.py +37 -0
  81. mteb/tasks/clustering/nld/iconclass_clustering_s2s.py +47 -0
  82. mteb/tasks/clustering/nld/open_tender_clustering_p2p.py +51 -0
  83. mteb/tasks/clustering/nld/open_tender_clustering_s2s.py +41 -0
  84. mteb/tasks/clustering/nld/vabb_clustering_p2p.py +51 -0
  85. mteb/tasks/clustering/nld/vabb_clustering_s2s.py +51 -0
  86. mteb/tasks/multilabel_classification/__init__.py +1 -0
  87. mteb/tasks/multilabel_classification/nld/__init__.py +9 -0
  88. mteb/tasks/multilabel_classification/nld/covid_disinformation_nl_multi_label_classification.py +88 -0
  89. mteb/tasks/multilabel_classification/nld/vabb_multi_label_classification.py +44 -0
  90. mteb/tasks/pair_classification/__init__.py +1 -0
  91. mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
  92. mteb/tasks/pair_classification/nld/__init__.py +7 -0
  93. mteb/tasks/pair_classification/nld/sick_nl_pair_classification.py +36 -0
  94. mteb/tasks/pair_classification/nld/xlwic_nl_pair_classification.py +41 -0
  95. mteb/tasks/retrieval/code/code_rag.py +8 -8
  96. mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
  97. mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
  98. mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
  99. mteb/tasks/retrieval/eng/__init__.py +18 -4
  100. mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
  101. mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
  102. mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
  103. mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
  104. mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
  105. mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
  106. mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
  107. mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
  108. mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
  109. mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
  110. mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
  111. mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
  112. mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
  113. mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
  114. mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
  115. mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
  116. mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
  117. mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
  118. mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
  119. mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
  120. mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
  121. mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
  122. mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
  123. mteb/tasks/retrieval/nld/__init__.py +10 -0
  124. mteb/tasks/retrieval/nld/bbsard_nl_retrieval.py +41 -0
  125. mteb/tasks/retrieval/nld/dutch_news_articles_retrieval.py +30 -0
  126. mteb/tasks/retrieval/nld/legal_qa_nl_retrieval.py +39 -0
  127. mteb/tasks/retrieval/nld/open_tender_retrieval.py +38 -0
  128. mteb/tasks/retrieval/nld/vabb_retrieval.py +41 -0
  129. mteb/tasks/retrieval/nob/norquad.py +2 -2
  130. mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
  131. mteb/tasks/retrieval/rus/__init__.py +11 -2
  132. mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
  133. mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
  134. mteb/tasks/sts/__init__.py +1 -0
  135. mteb/tasks/sts/nld/__init__.py +5 -0
  136. mteb/tasks/sts/nld/sick_nl_sts.py +41 -0
  137. mteb-2.1.1.dist-info/METADATA +253 -0
  138. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/RECORD +142 -95
  139. mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
  140. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
  141. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
  142. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
  143. mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
  144. mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
  145. mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
  146. mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
  147. mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
  148. mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
  149. mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
  150. mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
  151. mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
  152. mteb-2.0.5.dist-info/METADATA +0 -455
  153. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
  154. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
  155. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
  156. {mteb-2.0.5.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
@@ -1,183 +1,273 @@
1
1
  {
2
2
  "test": {
3
- "number_of_characters": 1149877,
4
3
  "num_samples": 32000,
5
- "num_queries": 16000,
6
- "num_documents": 16000,
7
- "min_document_length": 0,
8
- "average_document_length": 0,
9
- "max_document_length": 0,
10
- "unique_documents": 0,
11
- "num_document_images": 16000,
12
- "min_query_length": 12,
13
- "average_query_length": 71.8673125,
14
- "max_query_length": 385,
15
- "unique_queries": 15987,
16
- "num_query_images": 0,
17
- "min_relevant_docs_per_query": 1,
18
- "average_relevant_docs_per_query": 1.0,
19
- "max_relevant_docs_per_query": 1,
20
- "unique_relevant_docs": 16000,
4
+ "number_of_characters": 1021877,
5
+ "documents_text_statistics": null,
6
+ "documents_image_statistics": {
7
+ "min_image_width": 176,
8
+ "average_image_width": 514.5045,
9
+ "max_image_width": 640,
10
+ "min_image_height": 144,
11
+ "average_image_height": 444.223,
12
+ "max_image_height": 640,
13
+ "unique_images": 2000
14
+ },
15
+ "queries_text_statistics": {
16
+ "total_text_length": 1021877,
17
+ "min_text_length": 4,
18
+ "average_text_length": 63.8673125,
19
+ "max_text_length": 377,
20
+ "unique_texts": 15986
21
+ },
22
+ "queries_image_statistics": null,
23
+ "relevant_docs_statistics": {
24
+ "num_relevant_docs": 16000,
25
+ "min_relevant_docs_per_query": 1,
26
+ "average_relevant_docs_per_query": 1.0,
27
+ "max_relevant_docs_per_query": 1,
28
+ "unique_relevant_docs": 16000
29
+ },
30
+ "top_ranked_statistics": null,
21
31
  "hf_subset_descriptive_stats": {
22
32
  "de": {
23
- "number_of_characters": 132154,
24
33
  "num_samples": 4000,
25
- "num_queries": 2000,
26
- "num_documents": 2000,
27
- "min_document_length": 0,
28
- "average_document_length": 0,
29
- "max_document_length": 0,
30
- "unique_documents": 0,
31
- "num_document_images": 2000,
32
- "min_query_length": 4,
33
- "average_query_length": 66.077,
34
- "max_query_length": 220,
35
- "unique_queries": 1994,
36
- "num_query_images": 0,
37
- "min_relevant_docs_per_query": 1,
38
- "average_relevant_docs_per_query": 1.0,
39
- "max_relevant_docs_per_query": 1,
40
- "unique_relevant_docs": 2000
34
+ "number_of_characters": 132154,
35
+ "documents_text_statistics": null,
36
+ "documents_image_statistics": {
37
+ "min_image_width": 176,
38
+ "average_image_width": 514.5045,
39
+ "max_image_width": 640,
40
+ "min_image_height": 144,
41
+ "average_image_height": 444.223,
42
+ "max_image_height": 640,
43
+ "unique_images": 2000
44
+ },
45
+ "queries_text_statistics": {
46
+ "total_text_length": 132154,
47
+ "min_text_length": 4,
48
+ "average_text_length": 66.077,
49
+ "max_text_length": 220,
50
+ "unique_texts": 1994
51
+ },
52
+ "queries_image_statistics": null,
53
+ "relevant_docs_statistics": {
54
+ "num_relevant_docs": 2000,
55
+ "min_relevant_docs_per_query": 1,
56
+ "average_relevant_docs_per_query": 1.0,
57
+ "max_relevant_docs_per_query": 1,
58
+ "unique_relevant_docs": 2000
59
+ },
60
+ "top_ranked_statistics": null
41
61
  },
42
62
  "en": {
43
- "number_of_characters": 153801,
44
63
  "num_samples": 4000,
45
- "num_queries": 2000,
46
- "num_documents": 2000,
47
- "min_document_length": 0,
48
- "average_document_length": 0,
49
- "max_document_length": 0,
50
- "unique_documents": 0,
51
- "num_document_images": 2000,
52
- "min_query_length": 34,
53
- "average_query_length": 76.9005,
54
- "max_query_length": 377,
55
- "unique_queries": 2000,
56
- "num_query_images": 0,
57
- "min_relevant_docs_per_query": 1,
58
- "average_relevant_docs_per_query": 1.0,
59
- "max_relevant_docs_per_query": 1,
60
- "unique_relevant_docs": 2000
64
+ "number_of_characters": 153801,
65
+ "documents_text_statistics": null,
66
+ "documents_image_statistics": {
67
+ "min_image_width": 176,
68
+ "average_image_width": 514.5045,
69
+ "max_image_width": 640,
70
+ "min_image_height": 144,
71
+ "average_image_height": 444.223,
72
+ "max_image_height": 640,
73
+ "unique_images": 2000
74
+ },
75
+ "queries_text_statistics": {
76
+ "total_text_length": 153801,
77
+ "min_text_length": 34,
78
+ "average_text_length": 76.9005,
79
+ "max_text_length": 377,
80
+ "unique_texts": 2000
81
+ },
82
+ "queries_image_statistics": null,
83
+ "relevant_docs_statistics": {
84
+ "num_relevant_docs": 2000,
85
+ "min_relevant_docs_per_query": 1,
86
+ "average_relevant_docs_per_query": 1.0,
87
+ "max_relevant_docs_per_query": 1,
88
+ "unique_relevant_docs": 2000
89
+ },
90
+ "top_ranked_statistics": null
61
91
  },
62
92
  "es": {
63
- "number_of_characters": 160049,
64
93
  "num_samples": 4000,
65
- "num_queries": 2000,
66
- "num_documents": 2000,
67
- "min_document_length": 0,
68
- "average_document_length": 0,
69
- "max_document_length": 0,
70
- "unique_documents": 0,
71
- "num_document_images": 2000,
72
- "min_query_length": 23,
73
- "average_query_length": 80.0245,
74
- "max_query_length": 342,
75
- "unique_queries": 2000,
76
- "num_query_images": 0,
77
- "min_relevant_docs_per_query": 1,
78
- "average_relevant_docs_per_query": 1.0,
79
- "max_relevant_docs_per_query": 1,
80
- "unique_relevant_docs": 2000
94
+ "number_of_characters": 160049,
95
+ "documents_text_statistics": null,
96
+ "documents_image_statistics": {
97
+ "min_image_width": 176,
98
+ "average_image_width": 514.5045,
99
+ "max_image_width": 640,
100
+ "min_image_height": 144,
101
+ "average_image_height": 444.223,
102
+ "max_image_height": 640,
103
+ "unique_images": 2000
104
+ },
105
+ "queries_text_statistics": {
106
+ "total_text_length": 160049,
107
+ "min_text_length": 23,
108
+ "average_text_length": 80.0245,
109
+ "max_text_length": 342,
110
+ "unique_texts": 2000
111
+ },
112
+ "queries_image_statistics": null,
113
+ "relevant_docs_statistics": {
114
+ "num_relevant_docs": 2000,
115
+ "min_relevant_docs_per_query": 1,
116
+ "average_relevant_docs_per_query": 1.0,
117
+ "max_relevant_docs_per_query": 1,
118
+ "unique_relevant_docs": 2000
119
+ },
120
+ "top_ranked_statistics": null
81
121
  },
82
122
  "id": {
83
- "number_of_characters": 167858,
84
123
  "num_samples": 4000,
85
- "num_queries": 2000,
86
- "num_documents": 2000,
87
- "min_document_length": 0,
88
- "average_document_length": 0,
89
- "max_document_length": 0,
90
- "unique_documents": 0,
91
- "num_document_images": 2000,
92
- "min_query_length": 4,
93
- "average_query_length": 83.929,
94
- "max_query_length": 211,
95
- "unique_queries": 2000,
96
- "num_query_images": 0,
97
- "min_relevant_docs_per_query": 1,
98
- "average_relevant_docs_per_query": 1.0,
99
- "max_relevant_docs_per_query": 1,
100
- "unique_relevant_docs": 2000
124
+ "number_of_characters": 167858,
125
+ "documents_text_statistics": null,
126
+ "documents_image_statistics": {
127
+ "min_image_width": 176,
128
+ "average_image_width": 514.5045,
129
+ "max_image_width": 640,
130
+ "min_image_height": 144,
131
+ "average_image_height": 444.223,
132
+ "max_image_height": 640,
133
+ "unique_images": 2000
134
+ },
135
+ "queries_text_statistics": {
136
+ "total_text_length": 167858,
137
+ "min_text_length": 4,
138
+ "average_text_length": 83.929,
139
+ "max_text_length": 211,
140
+ "unique_texts": 2000
141
+ },
142
+ "queries_image_statistics": null,
143
+ "relevant_docs_statistics": {
144
+ "num_relevant_docs": 2000,
145
+ "min_relevant_docs_per_query": 1,
146
+ "average_relevant_docs_per_query": 1.0,
147
+ "max_relevant_docs_per_query": 1,
148
+ "unique_relevant_docs": 2000
149
+ },
150
+ "top_ranked_statistics": null
101
151
  },
102
152
  "ja": {
103
- "number_of_characters": 75480,
104
153
  "num_samples": 4000,
105
- "num_queries": 2000,
106
- "num_documents": 2000,
107
- "min_document_length": 0,
108
- "average_document_length": 0,
109
- "max_document_length": 0,
110
- "unique_documents": 0,
111
- "num_document_images": 2000,
112
- "min_query_length": 9,
113
- "average_query_length": 37.74,
114
- "max_query_length": 179,
115
- "unique_queries": 2000,
116
- "num_query_images": 0,
117
- "min_relevant_docs_per_query": 1,
118
- "average_relevant_docs_per_query": 1.0,
119
- "max_relevant_docs_per_query": 1,
120
- "unique_relevant_docs": 2000
154
+ "number_of_characters": 75480,
155
+ "documents_text_statistics": null,
156
+ "documents_image_statistics": {
157
+ "min_image_width": 176,
158
+ "average_image_width": 514.5045,
159
+ "max_image_width": 640,
160
+ "min_image_height": 144,
161
+ "average_image_height": 444.223,
162
+ "max_image_height": 640,
163
+ "unique_images": 2000
164
+ },
165
+ "queries_text_statistics": {
166
+ "total_text_length": 75480,
167
+ "min_text_length": 9,
168
+ "average_text_length": 37.74,
169
+ "max_text_length": 179,
170
+ "unique_texts": 2000
171
+ },
172
+ "queries_image_statistics": null,
173
+ "relevant_docs_statistics": {
174
+ "num_relevant_docs": 2000,
175
+ "min_relevant_docs_per_query": 1,
176
+ "average_relevant_docs_per_query": 1.0,
177
+ "max_relevant_docs_per_query": 1,
178
+ "unique_relevant_docs": 2000
179
+ },
180
+ "top_ranked_statistics": null
121
181
  },
122
182
  "ru": {
123
- "number_of_characters": 149947,
124
183
  "num_samples": 4000,
125
- "num_queries": 2000,
126
- "num_documents": 2000,
127
- "min_document_length": 0,
128
- "average_document_length": 0,
129
- "max_document_length": 0,
130
- "unique_documents": 0,
131
- "num_document_images": 2000,
132
- "min_query_length": 10,
133
- "average_query_length": 74.9735,
134
- "max_query_length": 294,
135
- "unique_queries": 1997,
136
- "num_query_images": 0,
137
- "min_relevant_docs_per_query": 1,
138
- "average_relevant_docs_per_query": 1.0,
139
- "max_relevant_docs_per_query": 1,
140
- "unique_relevant_docs": 2000
184
+ "number_of_characters": 149947,
185
+ "documents_text_statistics": null,
186
+ "documents_image_statistics": {
187
+ "min_image_width": 176,
188
+ "average_image_width": 514.5045,
189
+ "max_image_width": 640,
190
+ "min_image_height": 144,
191
+ "average_image_height": 444.223,
192
+ "max_image_height": 640,
193
+ "unique_images": 2000
194
+ },
195
+ "queries_text_statistics": {
196
+ "total_text_length": 149947,
197
+ "min_text_length": 10,
198
+ "average_text_length": 74.9735,
199
+ "max_text_length": 294,
200
+ "unique_texts": 1997
201
+ },
202
+ "queries_image_statistics": null,
203
+ "relevant_docs_statistics": {
204
+ "num_relevant_docs": 2000,
205
+ "min_relevant_docs_per_query": 1,
206
+ "average_relevant_docs_per_query": 1.0,
207
+ "max_relevant_docs_per_query": 1,
208
+ "unique_relevant_docs": 2000
209
+ },
210
+ "top_ranked_statistics": null
141
211
  },
142
212
  "tr": {
143
- "number_of_characters": 136134,
144
213
  "num_samples": 4000,
145
- "num_queries": 2000,
146
- "num_documents": 2000,
147
- "min_document_length": 0,
148
- "average_document_length": 0,
149
- "max_document_length": 0,
150
- "unique_documents": 0,
151
- "num_document_images": 2000,
152
- "min_query_length": 19,
153
- "average_query_length": 68.067,
154
- "max_query_length": 199,
155
- "unique_queries": 1997,
156
- "num_query_images": 0,
157
- "min_relevant_docs_per_query": 1,
158
- "average_relevant_docs_per_query": 1.0,
159
- "max_relevant_docs_per_query": 1,
160
- "unique_relevant_docs": 2000
214
+ "number_of_characters": 136134,
215
+ "documents_text_statistics": null,
216
+ "documents_image_statistics": {
217
+ "min_image_width": 176,
218
+ "average_image_width": 514.5045,
219
+ "max_image_width": 640,
220
+ "min_image_height": 144,
221
+ "average_image_height": 444.223,
222
+ "max_image_height": 640,
223
+ "unique_images": 2000
224
+ },
225
+ "queries_text_statistics": {
226
+ "total_text_length": 136134,
227
+ "min_text_length": 19,
228
+ "average_text_length": 68.067,
229
+ "max_text_length": 199,
230
+ "unique_texts": 1997
231
+ },
232
+ "queries_image_statistics": null,
233
+ "relevant_docs_statistics": {
234
+ "num_relevant_docs": 2000,
235
+ "min_relevant_docs_per_query": 1,
236
+ "average_relevant_docs_per_query": 1.0,
237
+ "max_relevant_docs_per_query": 1,
238
+ "unique_relevant_docs": 2000
239
+ },
240
+ "top_ranked_statistics": null
161
241
  },
162
242
  "zh": {
163
- "number_of_characters": 46454,
164
243
  "num_samples": 4000,
165
- "num_queries": 2000,
166
- "num_documents": 2000,
167
- "min_document_length": 0,
168
- "average_document_length": 0,
169
- "max_document_length": 0,
170
- "unique_documents": 0,
171
- "num_document_images": 2000,
172
- "min_query_length": 10,
173
- "average_query_length": 23.227,
174
- "max_query_length": 66,
175
- "unique_queries": 1999,
176
- "num_query_images": 0,
177
- "min_relevant_docs_per_query": 1,
178
- "average_relevant_docs_per_query": 1.0,
179
- "max_relevant_docs_per_query": 1,
180
- "unique_relevant_docs": 2000
244
+ "number_of_characters": 46454,
245
+ "documents_text_statistics": null,
246
+ "documents_image_statistics": {
247
+ "min_image_width": 176,
248
+ "average_image_width": 514.5045,
249
+ "max_image_width": 640,
250
+ "min_image_height": 144,
251
+ "average_image_height": 444.223,
252
+ "max_image_height": 640,
253
+ "unique_images": 2000
254
+ },
255
+ "queries_text_statistics": {
256
+ "total_text_length": 46454,
257
+ "min_text_length": 10,
258
+ "average_text_length": 23.227,
259
+ "max_text_length": 66,
260
+ "unique_texts": 1999
261
+ },
262
+ "queries_image_statistics": null,
263
+ "relevant_docs_statistics": {
264
+ "num_relevant_docs": 2000,
265
+ "min_relevant_docs_per_query": 1,
266
+ "average_relevant_docs_per_query": 1.0,
267
+ "max_relevant_docs_per_query": 1,
268
+ "unique_relevant_docs": 2000
269
+ },
270
+ "top_ranked_statistics": null
181
271
  }
182
272
  }
183
273
  }