mteb 2.0.4__py3-none-any.whl → 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. mteb/descriptive_stats/BitextMining/BUCC.json +70 -40
  2. mteb/descriptive_stats/Classification/DKHateClassification.json +40 -24
  3. mteb/descriptive_stats/Classification/FinancialPhrasebankClassification.json +23 -15
  4. mteb/descriptive_stats/Classification/ImdbClassification.json +40 -24
  5. mteb/descriptive_stats/Classification/KorHateClassification.json +23 -15
  6. mteb/descriptive_stats/Clustering/ArxivClusteringP2P.json +555 -550
  7. mteb/descriptive_stats/Clustering/ArxivClusteringP2P.v2.json +546 -541
  8. mteb/descriptive_stats/Clustering/ArxivClusteringS2S.json +555 -550
  9. mteb/descriptive_stats/Clustering/MLSUMClusteringP2P.json +2466 -2416
  10. mteb/descriptive_stats/Clustering/RedditClusteringP2P.json +1365 -1360
  11. mteb/descriptive_stats/Clustering/SNLClustering.json +378 -373
  12. mteb/descriptive_stats/Clustering/SwednClustering.json +28 -23
  13. mteb/descriptive_stats/Clustering/VGClustering.json +54 -49
  14. mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/WITT2IRetrieval.json +324 -204
  15. mteb/descriptive_stats/Image/Any2AnyRetrieval/MemotionI2TRetrieval.json +28 -18
  16. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRAirbnbSyntheticRetrieval.json +334 -0
  17. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRGitHubReadmeRetrieval.json +544 -0
  18. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRTweetStockSyntheticsRetrieval.json +334 -0
  19. mteb/descriptive_stats/Image/DocumentUnderstanding/JinaVDRWikimediaCommonsDocumentsRetrieval.json +634 -0
  20. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2ESGReportsRetrieval.json +154 -0
  21. mteb/descriptive_stats/Image/DocumentUnderstanding/Vidore2EconomicsReportsRetrieval.json +154 -0
  22. mteb/descriptive_stats/Image/ImageClassification/Imagenet1k.json +6039 -3007
  23. mteb/descriptive_stats/Image/ZeroShotClassification/Imagenet1kZeroShot.json +3024 -3010
  24. mteb/descriptive_stats/Image/ZeroShotClassification/PatchCamelyonZeroShot.json +30 -16
  25. mteb/descriptive_stats/Reranking/MIRACLReranking.json +555 -479
  26. mteb/descriptive_stats/Reranking/MindSmallReranking.json +29 -25
  27. mteb/descriptive_stats/Retrieval/AlloprofRetrieval.json +25 -26
  28. mteb/descriptive_stats/Retrieval/Code1Retrieval.json +30 -0
  29. mteb/descriptive_stats/Retrieval/DanFEVER.json +25 -26
  30. mteb/descriptive_stats/Retrieval/EnglishFinance1Retrieval.json +30 -0
  31. mteb/descriptive_stats/Retrieval/EnglishFinance2Retrieval.json +30 -0
  32. mteb/descriptive_stats/Retrieval/EnglishFinance3Retrieval.json +30 -0
  33. mteb/descriptive_stats/Retrieval/EnglishFinance4Retrieval.json +30 -0
  34. mteb/descriptive_stats/Retrieval/EnglishHealthcare1Retrieval.json +30 -0
  35. mteb/descriptive_stats/Retrieval/French1Retrieval.json +30 -0
  36. mteb/descriptive_stats/Retrieval/FrenchLegal1Retrieval.json +30 -0
  37. mteb/descriptive_stats/Retrieval/German1Retrieval.json +30 -0
  38. mteb/descriptive_stats/Retrieval/GermanHealthcare1Retrieval.json +30 -0
  39. mteb/descriptive_stats/Retrieval/GermanLegal1Retrieval.json +30 -0
  40. mteb/descriptive_stats/Retrieval/JapaneseCode1Retrieval.json +30 -0
  41. mteb/descriptive_stats/Retrieval/JapaneseLegal1Retrieval.json +30 -0
  42. mteb/descriptive_stats/Retrieval/MIRACLRetrieval.json +475 -494
  43. mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json +25 -26
  44. mteb/descriptive_stats/Retrieval/MSMARCO.json +25 -84
  45. mteb/descriptive_stats/Retrieval/Touche2020.json +25 -26
  46. mteb/descriptive_stats/Summarization/SummEval.json +27 -50
  47. mteb/descriptive_stats/Summarization/SummEvalFr.json +27 -50
  48. mteb/models/model_implementations/kalm_models.py +29 -0
  49. mteb/tasks/bitext_mining/multilingual/bucc_bitext_mining.py +1 -1
  50. mteb/tasks/classification/eng/financial_phrasebank_classification.py +0 -3
  51. mteb/tasks/classification/kor/kor_hate_classification.py +0 -12
  52. mteb/tasks/clustering/swe/swedn_clustering.py +2 -2
  53. mteb/tasks/retrieval/multilingual/vdr_multilingual_retrieval.py +1 -1
  54. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/METADATA +1 -1
  55. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/RECORD +59 -40
  56. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/WHEEL +0 -0
  57. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/entry_points.txt +0 -0
  58. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/licenses/LICENSE +0 -0
  59. {mteb-2.0.4.dist-info → mteb-2.0.5.dist-info}/top_level.txt +0 -0
@@ -1,554 +1,535 @@
1
1
  {
2
2
  "dev": {
3
3
  "num_samples": 106345647,
4
- "number_of_characters": 37176781172,
5
- "num_documents": 106332152,
6
- "min_document_length": 2,
7
- "average_document_length": 349.6241542163089,
8
- "max_document_length": 84930,
9
- "unique_documents": 106332152,
10
- "num_queries": 13495,
11
- "min_query_length": 5,
12
- "average_query_length": 36.49225639125602,
13
- "max_query_length": 176,
14
- "unique_queries": 13495,
15
- "none_queries": 0,
16
- "num_relevant_docs": 130408,
17
- "min_relevant_docs_per_query": 1,
18
- "average_relevant_docs_per_query": 2.3059651722860317,
19
- "max_relevant_docs_per_query": 20,
20
- "unique_relevant_docs": 119924,
21
- "num_instructions": null,
22
- "min_instruction_length": null,
23
- "average_instruction_length": null,
24
- "max_instruction_length": null,
25
- "unique_instructions": null,
26
- "num_top_ranked": null,
27
- "min_top_ranked_per_query": null,
28
- "average_top_ranked_per_query": null,
29
- "max_top_ranked_per_query": null,
4
+ "number_of_characters": 37283113324,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 37282620861,
7
+ "min_text_length": 3,
8
+ "average_text_length": 350.6241542163089,
9
+ "max_text_length": 84931,
10
+ "unique_texts": 106030398
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 492463,
15
+ "min_text_length": 5,
16
+ "average_text_length": 36.49225639125602,
17
+ "max_text_length": 176,
18
+ "unique_texts": 13490
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 31119,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 2.3059651722860317,
25
+ "max_relevant_docs_per_query": 20,
26
+ "unique_relevant_docs": 119924
27
+ },
28
+ "top_ranked_statistics": null,
30
29
  "hf_subset_descriptive_stats": {
31
30
  "ar": {
32
31
  "num_samples": 2064310,
33
- "number_of_characters": 656963110,
34
- "num_documents": 2061414,
35
- "min_document_length": 4,
36
- "average_document_length": 318.6539598547405,
37
- "max_document_length": 48550,
38
- "unique_documents": 2061414,
39
- "num_queries": 2896,
40
- "min_query_length": 12,
41
- "average_query_length": 29.480662983425415,
42
- "max_query_length": 101,
43
- "unique_queries": 2896,
44
- "none_queries": 0,
45
- "num_relevant_docs": 29197,
46
- "min_relevant_docs_per_query": 7,
47
- "average_relevant_docs_per_query": 1.953729281767956,
48
- "max_relevant_docs_per_query": 17,
49
- "unique_relevant_docs": 25881,
50
- "num_instructions": null,
51
- "min_instruction_length": null,
52
- "average_instruction_length": null,
53
- "max_instruction_length": null,
54
- "unique_instructions": null,
55
- "num_top_ranked": null,
56
- "min_top_ranked_per_query": null,
57
- "average_top_ranked_per_query": null,
58
- "max_top_ranked_per_query": null
32
+ "number_of_characters": 659024524,
33
+ "documents_text_statistics": {
34
+ "total_text_length": 658939148,
35
+ "min_text_length": 5,
36
+ "average_text_length": 319.6539598547405,
37
+ "max_text_length": 48551,
38
+ "unique_texts": 2056619
39
+ },
40
+ "documents_image_statistics": null,
41
+ "queries_text_statistics": {
42
+ "total_text_length": 85376,
43
+ "min_text_length": 12,
44
+ "average_text_length": 29.480662983425415,
45
+ "max_text_length": 101,
46
+ "unique_texts": 2896
47
+ },
48
+ "queries_image_statistics": null,
49
+ "relevant_docs_statistics": {
50
+ "num_relevant_docs": 5658,
51
+ "min_relevant_docs_per_query": 7,
52
+ "average_relevant_docs_per_query": 1.953729281767956,
53
+ "max_relevant_docs_per_query": 17,
54
+ "unique_relevant_docs": 25881
55
+ },
56
+ "top_ranked_statistics": null
59
57
  },
60
58
  "bn": {
61
59
  "num_samples": 297676,
62
- "number_of_characters": 113943984,
63
- "num_documents": 297265,
64
- "min_document_length": 3,
65
- "average_document_length": 383.2428136511194,
66
- "max_document_length": 17108,
67
- "unique_documents": 297265,
68
- "num_queries": 411,
69
- "min_query_length": 16,
70
- "average_query_length": 46.98053527980535,
71
- "max_query_length": 112,
72
- "unique_queries": 411,
73
- "none_queries": 0,
74
- "num_relevant_docs": 4206,
75
- "min_relevant_docs_per_query": 7,
76
- "average_relevant_docs_per_query": 2.099756690997567,
77
- "max_relevant_docs_per_query": 13,
78
- "unique_relevant_docs": 3729,
79
- "num_instructions": null,
80
- "min_instruction_length": null,
81
- "average_instruction_length": null,
82
- "max_instruction_length": null,
83
- "unique_instructions": null,
84
- "num_top_ranked": null,
85
- "min_top_ranked_per_query": null,
86
- "average_top_ranked_per_query": null,
87
- "max_top_ranked_per_query": null
60
+ "number_of_characters": 114241249,
61
+ "documents_text_statistics": {
62
+ "total_text_length": 114221940,
63
+ "min_text_length": 4,
64
+ "average_text_length": 384.2428136511194,
65
+ "max_text_length": 17109,
66
+ "unique_texts": 296877
67
+ },
68
+ "documents_image_statistics": null,
69
+ "queries_text_statistics": {
70
+ "total_text_length": 19309,
71
+ "min_text_length": 16,
72
+ "average_text_length": 46.98053527980535,
73
+ "max_text_length": 112,
74
+ "unique_texts": 411
75
+ },
76
+ "queries_image_statistics": null,
77
+ "relevant_docs_statistics": {
78
+ "num_relevant_docs": 863,
79
+ "min_relevant_docs_per_query": 7,
80
+ "average_relevant_docs_per_query": 2.099756690997567,
81
+ "max_relevant_docs_per_query": 13,
82
+ "unique_relevant_docs": 3729
83
+ },
84
+ "top_ranked_statistics": null
88
85
  },
89
86
  "de": {
90
87
  "num_samples": 15866527,
91
- "number_of_characters": 6573073185,
92
- "num_documents": 15866222,
93
- "min_document_length": 4,
94
- "average_document_length": 414.28004442393404,
95
- "max_document_length": 64968,
96
- "unique_documents": 15866222,
97
- "num_queries": 305,
98
- "min_query_length": 15,
99
- "average_query_length": 46.0,
100
- "max_query_length": 87,
101
- "unique_queries": 305,
102
- "none_queries": 0,
103
- "num_relevant_docs": 3144,
104
- "min_relevant_docs_per_query": 9,
105
- "average_relevant_docs_per_query": 2.6590163934426227,
106
- "max_relevant_docs_per_query": 20,
107
- "unique_relevant_docs": 3103,
108
- "num_instructions": null,
109
- "min_instruction_length": null,
110
- "average_instruction_length": null,
111
- "max_instruction_length": null,
112
- "unique_instructions": null,
113
- "num_top_ranked": null,
114
- "min_top_ranked_per_query": null,
115
- "average_top_ranked_per_query": null,
116
- "max_top_ranked_per_query": null
88
+ "number_of_characters": 6588939407,
89
+ "documents_text_statistics": {
90
+ "total_text_length": 6588925377,
91
+ "min_text_length": 5,
92
+ "average_text_length": 415.28004442393404,
93
+ "max_text_length": 64969,
94
+ "unique_texts": 15822967
95
+ },
96
+ "documents_image_statistics": null,
97
+ "queries_text_statistics": {
98
+ "total_text_length": 14030,
99
+ "min_text_length": 15,
100
+ "average_text_length": 46.0,
101
+ "max_text_length": 87,
102
+ "unique_texts": 304
103
+ },
104
+ "queries_image_statistics": null,
105
+ "relevant_docs_statistics": {
106
+ "num_relevant_docs": 811,
107
+ "min_relevant_docs_per_query": 9,
108
+ "average_relevant_docs_per_query": 2.6590163934426227,
109
+ "max_relevant_docs_per_query": 20,
110
+ "unique_relevant_docs": 3103
111
+ },
112
+ "top_ranked_statistics": null
117
113
  },
118
114
  "en": {
119
115
  "num_samples": 32894020,
120
- "number_of_characters": 13190354940,
121
- "num_documents": 32893221,
122
- "min_document_length": 3,
123
- "average_document_length": 401.0042914921588,
124
- "max_document_length": 36471,
125
- "unique_documents": 32893221,
126
- "num_queries": 799,
127
- "min_query_length": 16,
128
- "average_query_length": 40.247809762202756,
129
- "max_query_length": 122,
130
- "unique_queries": 799,
131
- "none_queries": 0,
132
- "num_relevant_docs": 8350,
133
- "min_relevant_docs_per_query": 9,
134
- "average_relevant_docs_per_query": 2.911138923654568,
135
- "max_relevant_docs_per_query": 16,
136
- "unique_relevant_docs": 7921,
137
- "num_instructions": null,
138
- "min_instruction_length": null,
139
- "average_instruction_length": null,
140
- "max_instruction_length": null,
141
- "unique_instructions": null,
142
- "num_top_ranked": null,
143
- "min_top_ranked_per_query": null,
144
- "average_top_ranked_per_query": null,
145
- "max_top_ranked_per_query": null
116
+ "number_of_characters": 13223248161,
117
+ "documents_text_statistics": {
118
+ "total_text_length": 13223216003,
119
+ "min_text_length": 4,
120
+ "average_text_length": 402.0042914921588,
121
+ "max_text_length": 36472,
122
+ "unique_texts": 32779516
123
+ },
124
+ "documents_image_statistics": null,
125
+ "queries_text_statistics": {
126
+ "total_text_length": 32158,
127
+ "min_text_length": 16,
128
+ "average_text_length": 40.247809762202756,
129
+ "max_text_length": 122,
130
+ "unique_texts": 799
131
+ },
132
+ "queries_image_statistics": null,
133
+ "relevant_docs_statistics": {
134
+ "num_relevant_docs": 2326,
135
+ "min_relevant_docs_per_query": 9,
136
+ "average_relevant_docs_per_query": 2.911138923654568,
137
+ "max_relevant_docs_per_query": 16,
138
+ "unique_relevant_docs": 7921
139
+ },
140
+ "top_ranked_statistics": null
146
141
  },
147
142
  "es": {
148
143
  "num_samples": 10374601,
149
- "number_of_characters": 4188115187,
150
- "num_documents": 10373953,
151
- "min_document_length": 2,
152
- "average_document_length": 403.71153493754986,
153
- "max_document_length": 57012,
154
- "unique_documents": 10373953,
155
- "num_queries": 648,
156
- "min_query_length": 19,
157
- "average_query_length": 47.373456790123456,
158
- "max_query_length": 88,
159
- "unique_queries": 648,
160
- "none_queries": 0,
161
- "num_relevant_docs": 6443,
162
- "min_relevant_docs_per_query": 2,
163
- "average_relevant_docs_per_query": 4.609567901234568,
164
- "max_relevant_docs_per_query": 10,
165
- "unique_relevant_docs": 6410,
166
- "num_instructions": null,
167
- "min_instruction_length": null,
168
- "average_instruction_length": null,
169
- "max_instruction_length": null,
170
- "unique_instructions": null,
171
- "num_top_ranked": null,
172
- "min_top_ranked_per_query": null,
173
- "average_top_ranked_per_query": null,
174
- "max_top_ranked_per_query": null
144
+ "number_of_characters": 4198489140,
145
+ "documents_text_statistics": {
146
+ "total_text_length": 4198458442,
147
+ "min_text_length": 3,
148
+ "average_text_length": 404.71153493754986,
149
+ "max_text_length": 57013,
150
+ "unique_texts": 10356641
151
+ },
152
+ "documents_image_statistics": null,
153
+ "queries_text_statistics": {
154
+ "total_text_length": 30698,
155
+ "min_text_length": 19,
156
+ "average_text_length": 47.373456790123456,
157
+ "max_text_length": 88,
158
+ "unique_texts": 648
159
+ },
160
+ "queries_image_statistics": null,
161
+ "relevant_docs_statistics": {
162
+ "num_relevant_docs": 2987,
163
+ "min_relevant_docs_per_query": 2,
164
+ "average_relevant_docs_per_query": 4.609567901234568,
165
+ "max_relevant_docs_per_query": 10,
166
+ "unique_relevant_docs": 6410
167
+ },
168
+ "top_ranked_statistics": null
175
169
  },
176
170
  "fa": {
177
171
  "num_samples": 2207804,
178
- "number_of_characters": 579734962,
179
- "num_documents": 2207172,
180
- "min_document_length": 4,
181
- "average_document_length": 262.6478385010321,
182
- "max_document_length": 36495,
183
- "unique_documents": 2207172,
184
- "num_queries": 632,
185
- "min_query_length": 18,
186
- "average_query_length": 41.1503164556962,
187
- "max_query_length": 82,
188
- "unique_queries": 632,
189
- "none_queries": 0,
190
- "num_relevant_docs": 6571,
191
- "min_relevant_docs_per_query": 9,
192
- "average_relevant_docs_per_query": 2.079113924050633,
193
- "max_relevant_docs_per_query": 20,
194
- "unique_relevant_docs": 6405,
195
- "num_instructions": null,
196
- "min_instruction_length": null,
197
- "average_instruction_length": null,
198
- "max_instruction_length": null,
199
- "unique_instructions": null,
200
- "num_top_ranked": null,
201
- "min_top_ranked_per_query": null,
202
- "average_top_ranked_per_query": null,
203
- "max_top_ranked_per_query": null
172
+ "number_of_characters": 581942134,
173
+ "documents_text_statistics": {
174
+ "total_text_length": 581916127,
175
+ "min_text_length": 5,
176
+ "average_text_length": 263.6478385010321,
177
+ "max_text_length": 36496,
178
+ "unique_texts": 2203629
179
+ },
180
+ "documents_image_statistics": null,
181
+ "queries_text_statistics": {
182
+ "total_text_length": 26007,
183
+ "min_text_length": 18,
184
+ "average_text_length": 41.1503164556962,
185
+ "max_text_length": 82,
186
+ "unique_texts": 631
187
+ },
188
+ "queries_image_statistics": null,
189
+ "relevant_docs_statistics": {
190
+ "num_relevant_docs": 1314,
191
+ "min_relevant_docs_per_query": 9,
192
+ "average_relevant_docs_per_query": 2.079113924050633,
193
+ "max_relevant_docs_per_query": 20,
194
+ "unique_relevant_docs": 6405
195
+ },
196
+ "top_ranked_statistics": null
204
197
  },
205
198
  "fi": {
206
199
  "num_samples": 1884780,
207
- "number_of_characters": 677881948,
208
- "num_documents": 1883509,
209
- "min_document_length": 4,
210
- "average_document_length": 359.87767671935734,
211
- "max_document_length": 11578,
212
- "unique_documents": 1883509,
213
- "num_queries": 1271,
214
- "min_query_length": 14,
215
- "average_query_length": 38.63493312352478,
216
- "max_query_length": 130,
217
- "unique_queries": 1271,
218
- "none_queries": 0,
219
- "num_relevant_docs": 12008,
220
- "min_relevant_docs_per_query": 1,
221
- "average_relevant_docs_per_query": 1.925255704169945,
222
- "max_relevant_docs_per_query": 16,
223
- "unique_relevant_docs": 11365,
224
- "num_instructions": null,
225
- "min_instruction_length": null,
226
- "average_instruction_length": null,
227
- "max_instruction_length": null,
228
- "unique_instructions": null,
229
- "num_top_ranked": null,
230
- "min_top_ranked_per_query": null,
231
- "average_top_ranked_per_query": null,
232
- "max_top_ranked_per_query": null
200
+ "number_of_characters": 679765457,
201
+ "documents_text_statistics": {
202
+ "total_text_length": 679716352,
203
+ "min_text_length": 5,
204
+ "average_text_length": 360.87767671935734,
205
+ "max_text_length": 11579,
206
+ "unique_texts": 1880731
207
+ },
208
+ "documents_image_statistics": null,
209
+ "queries_text_statistics": {
210
+ "total_text_length": 49105,
211
+ "min_text_length": 14,
212
+ "average_text_length": 38.63493312352478,
213
+ "max_text_length": 130,
214
+ "unique_texts": 1271
215
+ },
216
+ "queries_image_statistics": null,
217
+ "relevant_docs_statistics": {
218
+ "num_relevant_docs": 2447,
219
+ "min_relevant_docs_per_query": 1,
220
+ "average_relevant_docs_per_query": 1.925255704169945,
221
+ "max_relevant_docs_per_query": 16,
222
+ "unique_relevant_docs": 11365
223
+ },
224
+ "top_ranked_statistics": null
233
225
  },
234
226
  "fr": {
235
227
  "num_samples": 14637296,
236
- "number_of_characters": 5029687134,
237
- "num_documents": 14636953,
238
- "min_document_length": 3,
239
- "average_document_length": 343.6283550271699,
240
- "max_document_length": 52638,
241
- "unique_documents": 14636953,
242
- "num_queries": 343,
243
- "min_query_length": 16,
244
- "average_query_length": 43.883381924198254,
245
- "max_query_length": 83,
246
- "unique_queries": 343,
247
- "none_queries": 0,
248
- "num_relevant_docs": 3429,
249
- "min_relevant_docs_per_query": 9,
250
- "average_relevant_docs_per_query": 2.131195335276968,
251
- "max_relevant_docs_per_query": 10,
252
- "unique_relevant_docs": 3407,
253
- "num_instructions": null,
254
- "min_instruction_length": null,
255
- "average_instruction_length": null,
256
- "max_instruction_length": null,
257
- "unique_instructions": null,
258
- "num_top_ranked": null,
259
- "min_top_ranked_per_query": null,
260
- "average_top_ranked_per_query": null,
261
- "max_top_ranked_per_query": null
228
+ "number_of_characters": 5044324087,
229
+ "documents_text_statistics": {
230
+ "total_text_length": 5044309035,
231
+ "min_text_length": 4,
232
+ "average_text_length": 344.6283550271699,
233
+ "max_text_length": 52639,
234
+ "unique_texts": 14595458
235
+ },
236
+ "documents_image_statistics": null,
237
+ "queries_text_statistics": {
238
+ "total_text_length": 15052,
239
+ "min_text_length": 16,
240
+ "average_text_length": 43.883381924198254,
241
+ "max_text_length": 83,
242
+ "unique_texts": 343
243
+ },
244
+ "queries_image_statistics": null,
245
+ "relevant_docs_statistics": {
246
+ "num_relevant_docs": 731,
247
+ "min_relevant_docs_per_query": 9,
248
+ "average_relevant_docs_per_query": 2.131195335276968,
249
+ "max_relevant_docs_per_query": 10,
250
+ "unique_relevant_docs": 3407
251
+ },
252
+ "top_ranked_statistics": null
262
253
  },
263
254
  "hi": {
264
255
  "num_samples": 506614,
265
- "number_of_characters": 187823359,
266
- "num_documents": 506264,
267
- "min_document_length": 2,
268
- "average_document_length": 370.96196845914386,
269
- "max_document_length": 44769,
270
- "unique_documents": 506264,
271
- "num_queries": 350,
272
- "min_query_length": 24,
273
- "average_query_length": 53.34,
274
- "max_query_length": 120,
275
- "unique_queries": 350,
276
- "none_queries": 0,
277
- "num_relevant_docs": 3494,
278
- "min_relevant_docs_per_query": 6,
279
- "average_relevant_docs_per_query": 2.1485714285714286,
280
- "max_relevant_docs_per_query": 10,
281
- "unique_relevant_docs": 3342,
282
- "num_instructions": null,
283
- "min_instruction_length": null,
284
- "average_instruction_length": null,
285
- "max_instruction_length": null,
286
- "unique_instructions": null,
287
- "num_top_ranked": null,
288
- "min_top_ranked_per_query": null,
289
- "average_top_ranked_per_query": null,
290
- "max_top_ranked_per_query": null
256
+ "number_of_characters": 188329623,
257
+ "documents_text_statistics": {
258
+ "total_text_length": 188310954,
259
+ "min_text_length": 3,
260
+ "average_text_length": 371.96196845914386,
261
+ "max_text_length": 44770,
262
+ "unique_texts": 504881
263
+ },
264
+ "documents_image_statistics": null,
265
+ "queries_text_statistics": {
266
+ "total_text_length": 18669,
267
+ "min_text_length": 24,
268
+ "average_text_length": 53.34,
269
+ "max_text_length": 120,
270
+ "unique_texts": 350
271
+ },
272
+ "queries_image_statistics": null,
273
+ "relevant_docs_statistics": {
274
+ "num_relevant_docs": 752,
275
+ "min_relevant_docs_per_query": 6,
276
+ "average_relevant_docs_per_query": 2.1485714285714286,
277
+ "max_relevant_docs_per_query": 10,
278
+ "unique_relevant_docs": 3342
279
+ },
280
+ "top_ranked_statistics": null
291
281
  },
292
282
  "id": {
293
283
  "num_samples": 1447275,
294
- "number_of_characters": 506649583,
295
- "num_documents": 1446315,
296
- "min_document_length": 4,
297
- "average_document_length": 350.2785651811673,
298
- "max_document_length": 39539,
299
- "unique_documents": 1446315,
300
- "num_queries": 960,
301
- "min_query_length": 13,
302
- "average_query_length": 37.958333333333336,
303
- "max_query_length": 93,
304
- "unique_queries": 960,
305
- "none_queries": 0,
306
- "num_relevant_docs": 9668,
307
- "min_relevant_docs_per_query": 2,
308
- "average_relevant_docs_per_query": 3.216666666666667,
309
- "max_relevant_docs_per_query": 17,
310
- "unique_relevant_docs": 8286,
311
- "num_instructions": null,
312
- "min_instruction_length": null,
313
- "average_instruction_length": null,
314
- "max_instruction_length": null,
315
- "unique_instructions": null,
316
- "num_top_ranked": null,
317
- "min_top_ranked_per_query": null,
318
- "average_top_ranked_per_query": null,
319
- "max_top_ranked_per_query": null
284
+ "number_of_characters": 508095898,
285
+ "documents_text_statistics": {
286
+ "total_text_length": 508059458,
287
+ "min_text_length": 5,
288
+ "average_text_length": 351.2785651811673,
289
+ "max_text_length": 39540,
290
+ "unique_texts": 1441163
291
+ },
292
+ "documents_image_statistics": null,
293
+ "queries_text_statistics": {
294
+ "total_text_length": 36440,
295
+ "min_text_length": 13,
296
+ "average_text_length": 37.958333333333336,
297
+ "max_text_length": 93,
298
+ "unique_texts": 960
299
+ },
300
+ "queries_image_statistics": null,
301
+ "relevant_docs_statistics": {
302
+ "num_relevant_docs": 3088,
303
+ "min_relevant_docs_per_query": 2,
304
+ "average_relevant_docs_per_query": 3.216666666666667,
305
+ "max_relevant_docs_per_query": 17,
306
+ "unique_relevant_docs": 8286
307
+ },
308
+ "top_ranked_statistics": null
320
309
  },
321
310
  "ja": {
322
311
  "num_samples": 6954474,
323
- "number_of_characters": 1014226413,
324
- "num_documents": 6953614,
325
- "min_document_length": 2,
326
- "average_document_length": 145.8538220556965,
327
- "max_document_length": 25236,
328
- "unique_documents": 6953614,
329
- "num_queries": 860,
330
- "min_query_length": 7,
331
- "average_query_length": 17.71395348837209,
332
- "max_query_length": 48,
333
- "unique_queries": 860,
334
- "none_queries": 0,
335
- "num_relevant_docs": 8354,
336
- "min_relevant_docs_per_query": 1,
337
- "average_relevant_docs_per_query": 2.0813953488372094,
338
- "max_relevant_docs_per_query": 16,
339
- "unique_relevant_docs": 8066,
340
- "num_instructions": null,
341
- "min_instruction_length": null,
342
- "average_instruction_length": null,
343
- "max_instruction_length": null,
344
- "unique_instructions": null,
345
- "num_top_ranked": null,
346
- "min_top_ranked_per_query": null,
347
- "average_top_ranked_per_query": null,
348
- "max_top_ranked_per_query": null
312
+ "number_of_characters": 1021180027,
313
+ "documents_text_statistics": {
314
+ "total_text_length": 1021164793,
315
+ "min_text_length": 3,
316
+ "average_text_length": 146.8538220556965,
317
+ "max_text_length": 25237,
318
+ "unique_texts": 6935029
319
+ },
320
+ "documents_image_statistics": null,
321
+ "queries_text_statistics": {
322
+ "total_text_length": 15234,
323
+ "min_text_length": 7,
324
+ "average_text_length": 17.71395348837209,
325
+ "max_text_length": 48,
326
+ "unique_texts": 860
327
+ },
328
+ "queries_image_statistics": null,
329
+ "relevant_docs_statistics": {
330
+ "num_relevant_docs": 1790,
331
+ "min_relevant_docs_per_query": 1,
332
+ "average_relevant_docs_per_query": 2.0813953488372094,
333
+ "max_relevant_docs_per_query": 16,
334
+ "unique_relevant_docs": 8066
335
+ },
336
+ "top_ranked_statistics": null
349
337
  },
350
338
  "ko": {
351
339
  "num_samples": 1486965,
352
- "number_of_characters": 258664503,
353
- "num_documents": 1486752,
354
- "min_document_length": 3,
355
- "average_document_length": 173.97649170809927,
356
- "max_document_length": 25246,
357
- "unique_documents": 1486752,
358
- "num_queries": 213,
359
- "min_query_length": 5,
360
- "average_query_length": 21.624413145539908,
361
- "max_query_length": 92,
362
- "unique_queries": 213,
363
- "none_queries": 0,
364
- "num_relevant_docs": 3057,
365
- "min_relevant_docs_per_query": 9,
366
- "average_relevant_docs_per_query": 2.568075117370892,
367
- "max_relevant_docs_per_query": 20,
368
- "unique_relevant_docs": 2835,
369
- "num_instructions": null,
370
- "min_instruction_length": null,
371
- "average_instruction_length": null,
372
- "max_instruction_length": null,
373
- "unique_instructions": null,
374
- "num_top_ranked": null,
375
- "min_top_ranked_per_query": null,
376
- "average_top_ranked_per_query": null,
377
- "max_top_ranked_per_query": null
340
+ "number_of_characters": 260151255,
341
+ "documents_text_statistics": {
342
+ "total_text_length": 260146649,
343
+ "min_text_length": 4,
344
+ "average_text_length": 174.97649170809927,
345
+ "max_text_length": 25247,
346
+ "unique_texts": 1481128
347
+ },
348
+ "documents_image_statistics": null,
349
+ "queries_text_statistics": {
350
+ "total_text_length": 4606,
351
+ "min_text_length": 5,
352
+ "average_text_length": 21.624413145539908,
353
+ "max_text_length": 92,
354
+ "unique_texts": 213
355
+ },
356
+ "queries_image_statistics": null,
357
+ "relevant_docs_statistics": {
358
+ "num_relevant_docs": 547,
359
+ "min_relevant_docs_per_query": 9,
360
+ "average_relevant_docs_per_query": 2.568075117370892,
361
+ "max_relevant_docs_per_query": 20,
362
+ "unique_relevant_docs": 2835
363
+ },
364
+ "top_ranked_statistics": null
378
365
  },
379
366
  "ru": {
380
367
  "num_samples": 9545170,
381
- "number_of_characters": 3170998510,
382
- "num_documents": 9543918,
383
- "min_document_length": 3,
384
- "average_document_length": 332.2475377512674,
385
- "max_document_length": 61659,
386
- "unique_documents": 9543918,
387
- "num_queries": 1252,
388
- "min_query_length": 15,
389
- "average_query_length": 44.13258785942492,
390
- "max_query_length": 108,
391
- "unique_queries": 1252,
392
- "none_queries": 0,
393
- "num_relevant_docs": 13100,
394
- "min_relevant_docs_per_query": 9,
395
- "average_relevant_docs_per_query": 2.8434504792332267,
396
- "max_relevant_docs_per_query": 18,
397
- "unique_relevant_docs": 12607,
398
- "num_instructions": null,
399
- "min_instruction_length": null,
400
- "average_instruction_length": null,
401
- "max_instruction_length": null,
402
- "unique_instructions": null,
403
- "num_top_ranked": null,
404
- "min_top_ranked_per_query": null,
405
- "average_top_ranked_per_query": null,
406
- "max_top_ranked_per_query": null
368
+ "number_of_characters": 3180542428,
369
+ "documents_text_statistics": {
370
+ "total_text_length": 3180487174,
371
+ "min_text_length": 4,
372
+ "average_text_length": 333.2475377512674,
373
+ "max_text_length": 61660,
374
+ "unique_texts": 9524523
375
+ },
376
+ "documents_image_statistics": null,
377
+ "queries_text_statistics": {
378
+ "total_text_length": 55254,
379
+ "min_text_length": 15,
380
+ "average_text_length": 44.13258785942492,
381
+ "max_text_length": 108,
382
+ "unique_texts": 1252
383
+ },
384
+ "queries_image_statistics": null,
385
+ "relevant_docs_statistics": {
386
+ "num_relevant_docs": 3560,
387
+ "min_relevant_docs_per_query": 9,
388
+ "average_relevant_docs_per_query": 2.8434504792332267,
389
+ "max_relevant_docs_per_query": 18,
390
+ "unique_relevant_docs": 12607
391
+ },
392
+ "top_ranked_statistics": null
407
393
  },
408
394
  "sw": {
409
395
  "num_samples": 132406,
410
- "number_of_characters": 30191582,
411
- "num_documents": 131924,
412
- "min_document_length": 6,
413
- "average_document_length": 228.71348655286377,
414
- "max_document_length": 11203,
415
- "unique_documents": 131924,
416
- "num_queries": 482,
417
- "min_query_length": 13,
418
- "average_query_length": 38.97095435684647,
419
- "max_query_length": 80,
420
- "unique_queries": 482,
421
- "none_queries": 0,
422
- "num_relevant_docs": 5092,
423
- "min_relevant_docs_per_query": 1,
424
- "average_relevant_docs_per_query": 1.887966804979253,
425
- "max_relevant_docs_per_query": 17,
426
- "unique_relevant_docs": 3514,
427
- "num_instructions": null,
428
- "min_instruction_length": null,
429
- "average_instruction_length": null,
430
- "max_instruction_length": null,
431
- "unique_instructions": null,
432
- "num_top_ranked": null,
433
- "min_top_ranked_per_query": null,
434
- "average_top_ranked_per_query": null,
435
- "max_top_ranked_per_query": null
396
+ "number_of_characters": 30323506,
397
+ "documents_text_statistics": {
398
+ "total_text_length": 30304722,
399
+ "min_text_length": 7,
400
+ "average_text_length": 229.71348655286377,
401
+ "max_text_length": 11204,
402
+ "unique_texts": 131815
403
+ },
404
+ "documents_image_statistics": null,
405
+ "queries_text_statistics": {
406
+ "total_text_length": 18784,
407
+ "min_text_length": 13,
408
+ "average_text_length": 38.97095435684647,
409
+ "max_text_length": 80,
410
+ "unique_texts": 482
411
+ },
412
+ "queries_image_statistics": null,
413
+ "relevant_docs_statistics": {
414
+ "num_relevant_docs": 910,
415
+ "min_relevant_docs_per_query": 1,
416
+ "average_relevant_docs_per_query": 1.887966804979253,
417
+ "max_relevant_docs_per_query": 17,
418
+ "unique_relevant_docs": 3514
419
+ },
420
+ "top_ranked_statistics": null
436
421
  },
437
422
  "te": {
438
423
  "num_samples": 518907,
439
- "number_of_characters": 205300087,
440
- "num_documents": 518079,
441
- "min_document_length": 5,
442
- "average_document_length": 396.2108674545774,
443
- "max_document_length": 17850,
444
- "unique_documents": 518079,
445
- "num_queries": 828,
446
- "min_query_length": 14,
447
- "average_query_length": 38.11231884057971,
448
- "max_query_length": 111,
449
- "unique_queries": 828,
450
- "none_queries": 0,
451
- "num_relevant_docs": 1606,
452
- "min_relevant_docs_per_query": 1,
453
- "average_relevant_docs_per_query": 1.0314009661835748,
454
- "max_relevant_docs_per_query": 11,
455
- "unique_relevant_docs": 1457,
456
- "num_instructions": null,
457
- "min_instruction_length": null,
458
- "average_instruction_length": null,
459
- "max_instruction_length": null,
460
- "unique_instructions": null,
461
- "num_top_ranked": null,
462
- "min_top_ranked_per_query": null,
463
- "average_top_ranked_per_query": null,
464
- "max_top_ranked_per_query": null
424
+ "number_of_characters": 205818166,
425
+ "documents_text_statistics": {
426
+ "total_text_length": 205786609,
427
+ "min_text_length": 6,
428
+ "average_text_length": 397.2108674545774,
429
+ "max_text_length": 17851,
430
+ "unique_texts": 517192
431
+ },
432
+ "documents_image_statistics": null,
433
+ "queries_text_statistics": {
434
+ "total_text_length": 31557,
435
+ "min_text_length": 14,
436
+ "average_text_length": 38.11231884057971,
437
+ "max_text_length": 111,
438
+ "unique_texts": 828
439
+ },
440
+ "queries_image_statistics": null,
441
+ "relevant_docs_statistics": {
442
+ "num_relevant_docs": 854,
443
+ "min_relevant_docs_per_query": 1,
444
+ "average_relevant_docs_per_query": 1.0314009661835748,
445
+ "max_relevant_docs_per_query": 11,
446
+ "unique_relevant_docs": 1457
447
+ },
448
+ "top_ranked_statistics": null
465
449
  },
466
450
  "th": {
467
451
  "num_samples": 542899,
468
- "number_of_characters": 193491627,
469
- "num_documents": 542166,
470
- "min_document_length": 5,
471
- "average_document_length": 356.8283496198581,
472
- "max_document_length": 31250,
473
- "unique_documents": 542166,
474
- "num_queries": 733,
475
- "min_query_length": 14,
476
- "average_query_length": 42.87585266030014,
477
- "max_query_length": 176,
478
- "unique_queries": 733,
479
- "none_queries": 0,
480
- "num_relevant_docs": 7573,
481
- "min_relevant_docs_per_query": 1,
482
- "average_relevant_docs_per_query": 1.8321964529331514,
483
- "max_relevant_docs_per_query": 15,
484
- "unique_relevant_docs": 6868,
485
- "num_instructions": null,
486
- "min_instruction_length": null,
487
- "average_instruction_length": null,
488
- "max_instruction_length": null,
489
- "unique_instructions": null,
490
- "num_top_ranked": null,
491
- "min_top_ranked_per_query": null,
492
- "average_top_ranked_per_query": null,
493
- "max_top_ranked_per_query": null
452
+ "number_of_characters": 194033793,
453
+ "documents_text_statistics": {
454
+ "total_text_length": 194002365,
455
+ "min_text_length": 6,
456
+ "average_text_length": 357.8283496198581,
457
+ "max_text_length": 31251,
458
+ "unique_texts": 539602
459
+ },
460
+ "documents_image_statistics": null,
461
+ "queries_text_statistics": {
462
+ "total_text_length": 31428,
463
+ "min_text_length": 14,
464
+ "average_text_length": 42.87585266030014,
465
+ "max_text_length": 176,
466
+ "unique_texts": 733
467
+ },
468
+ "queries_image_statistics": null,
469
+ "relevant_docs_statistics": {
470
+ "num_relevant_docs": 1343,
471
+ "min_relevant_docs_per_query": 1,
472
+ "average_relevant_docs_per_query": 1.8321964529331514,
473
+ "max_relevant_docs_per_query": 15,
474
+ "unique_relevant_docs": 6868
475
+ },
476
+ "top_ranked_statistics": null
494
477
  },
495
478
  "yo": {
496
479
  "num_samples": 49162,
497
- "number_of_characters": 7819610,
498
- "num_documents": 49043,
499
- "min_document_length": 2,
500
- "average_document_length": 159.35250698366738,
501
- "max_document_length": 10469,
502
- "unique_documents": 49043,
503
- "num_queries": 119,
504
- "min_query_length": 25,
505
- "average_query_length": 37.6890756302521,
506
- "max_query_length": 56,
507
- "unique_queries": 119,
508
- "none_queries": 0,
509
- "num_relevant_docs": 1188,
510
- "min_relevant_docs_per_query": 9,
511
- "average_relevant_docs_per_query": 1.2100840336134453,
512
- "max_relevant_docs_per_query": 10,
513
- "unique_relevant_docs": 942,
514
- "num_instructions": null,
515
- "min_instruction_length": null,
516
- "average_instruction_length": null,
517
- "max_instruction_length": null,
518
- "unique_instructions": null,
519
- "num_top_ranked": null,
520
- "min_top_ranked_per_query": null,
521
- "average_top_ranked_per_query": null,
522
- "max_top_ranked_per_query": null
480
+ "number_of_characters": 7868653,
481
+ "documents_text_statistics": {
482
+ "total_text_length": 7864168,
483
+ "min_text_length": 3,
484
+ "average_text_length": 160.35250698366738,
485
+ "max_text_length": 10470,
486
+ "unique_texts": 48441
487
+ },
488
+ "documents_image_statistics": null,
489
+ "queries_text_statistics": {
490
+ "total_text_length": 4485,
491
+ "min_text_length": 25,
492
+ "average_text_length": 37.6890756302521,
493
+ "max_text_length": 56,
494
+ "unique_texts": 119
495
+ },
496
+ "queries_image_statistics": null,
497
+ "relevant_docs_statistics": {
498
+ "num_relevant_docs": 144,
499
+ "min_relevant_docs_per_query": 9,
500
+ "average_relevant_docs_per_query": 1.2100840336134453,
501
+ "max_relevant_docs_per_query": 10,
502
+ "unique_relevant_docs": 942
503
+ },
504
+ "top_ranked_statistics": null
523
505
  },
524
506
  "zh": {
525
507
  "num_samples": 4934761,
526
- "number_of_characters": 591861448,
527
- "num_documents": 4934368,
528
- "min_document_length": 2,
529
- "average_document_length": 119.9458931721347,
530
- "max_document_length": 84930,
531
- "unique_documents": 4934368,
532
- "num_queries": 393,
533
- "min_query_length": 7,
534
- "average_query_length": 10.867684478371501,
535
- "max_query_length": 22,
536
- "unique_queries": 393,
537
- "none_queries": 0,
538
- "num_relevant_docs": 3928,
539
- "min_relevant_docs_per_query": 8,
540
- "average_relevant_docs_per_query": 2.5292620865139948,
541
- "max_relevant_docs_per_query": 10,
542
- "unique_relevant_docs": 3786,
543
- "num_instructions": null,
544
- "min_instruction_length": null,
545
- "average_instruction_length": null,
546
- "max_instruction_length": null,
547
- "unique_instructions": null,
548
- "num_top_ranked": null,
549
- "min_top_ranked_per_query": null,
550
- "average_top_ranked_per_query": null,
551
- "max_top_ranked_per_query": null
508
+ "number_of_characters": 596795816,
509
+ "documents_text_statistics": {
510
+ "total_text_length": 596791545,
511
+ "min_text_length": 3,
512
+ "average_text_length": 120.9458931721347,
513
+ "max_text_length": 84931,
514
+ "unique_texts": 4922624
515
+ },
516
+ "documents_image_statistics": null,
517
+ "queries_text_statistics": {
518
+ "total_text_length": 4271,
519
+ "min_text_length": 7,
520
+ "average_text_length": 10.867684478371501,
521
+ "max_text_length": 22,
522
+ "unique_texts": 390
523
+ },
524
+ "queries_image_statistics": null,
525
+ "relevant_docs_statistics": {
526
+ "num_relevant_docs": 994,
527
+ "min_relevant_docs_per_query": 8,
528
+ "average_relevant_docs_per_query": 2.5292620865139948,
529
+ "max_relevant_docs_per_query": 10,
530
+ "unique_relevant_docs": 3786
531
+ },
532
+ "top_ranked_statistics": null
552
533
  }
553
534
  }
554
535
  }