mteb-2.1.0-py3-none-any.whl → mteb-2.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +2 -0
- mteb/abstasks/_stratification.py +1 -1
- mteb/abstasks/abstask.py +6 -1
- mteb/abstasks/dataset_card_template.md +1 -1
- mteb/abstasks/retrieval.py +2 -1
- mteb/abstasks/retrieval_dataset_loaders.py +1 -1
- mteb/abstasks/task_metadata.py +1 -1
- mteb/benchmarks/benchmarks/benchmarks.py +7 -11
- mteb/benchmarks/get_benchmark.py +1 -1
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json +243 -153
- mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XM3600T2IRetrieval.json +999 -629
- mteb/descriptive_stats/Image/Any2AnyRetrieval/OVENIT2TRetrieval.json +33 -17
- mteb/descriptive_stats/Image/DocumentUnderstanding/MIRACLVisionRetrieval.json +574 -0
- mteb/descriptive_stats/Retrieval/ClimateFEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/DBPediaHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/FEVERHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/HotpotQAHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/QuoraRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/RiaNewsRetrievalHardNegatives.v2.json +30 -0
- mteb/descriptive_stats/Retrieval/VDRMultilingualRetrieval.json +184 -0
- mteb/languages/check_language_code.py +11 -3
- mteb/languages/language_scripts.py +4 -0
- mteb/leaderboard/text_segments.py +1 -1
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +1 -3
- mteb/models/model_implementations/bmretriever_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/ibm_granite_models.py +1 -1
- mteb/models/model_implementations/inf_models.py +3 -3
- mteb/models/model_implementations/jina_models.py +12 -2
- mteb/models/model_implementations/llm2vec_models.py +1 -1
- mteb/models/model_implementations/misc_models.py +2 -2
- mteb/models/model_implementations/mxbai_models.py +1 -1
- mteb/models/model_implementations/salesforce_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +1 -1
- mteb/models/model_implementations/voyage_v.py +9 -9
- mteb/results/task_result.py +6 -8
- mteb/tasks/classification/dan/angry_tweets_classification.py +2 -2
- mteb/tasks/classification/eng/legal_bench_classification.py +3 -3
- mteb/tasks/classification/mya/myanmar_news.py +2 -2
- mteb/tasks/classification/tha/wongnai_reviews_classification.py +1 -1
- mteb/tasks/classification/ukr/ukr_formality_classification.py +2 -2
- mteb/tasks/pair_classification/multilingual/indic_xnli_pair_classification.py +9 -8
- mteb/tasks/retrieval/code/code_rag.py +8 -8
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +18 -4
- mteb/tasks/retrieval/eng/climate_fever_retrieval.py +68 -77
- mteb/tasks/retrieval/eng/dbpedia_retrieval.py +55 -50
- mteb/tasks/retrieval/eng/fever_retrieval.py +62 -67
- mteb/tasks/retrieval/eng/hateful_memes_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hateful_memes_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/hotpot_qa_retrieval.py +57 -67
- mteb/tasks/retrieval/eng/legal_summarization_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/memotion_i2t_retrieval.py +0 -3
- mteb/tasks/retrieval/eng/memotion_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/eng/oven_it2t_retrieval.py +1 -1
- mteb/tasks/retrieval/eng/quora_retrieval.py +51 -46
- mteb/tasks/retrieval/eng/sci_mmir_i2t_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/sci_mmir_t2i_retrieval.py +0 -4
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/jpn/ja_gov_faqs_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/belebele_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/miracl_retrieval.py +1 -1
- mteb/tasks/retrieval/multilingual/miracl_vision_retrieval.py +2 -9
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/wit_t2i_retrieval.py +0 -2
- mteb/tasks/retrieval/multilingual/x_flickr30k_co_t2i_retrieval.py +6 -5
- mteb/tasks/retrieval/multilingual/xm3600_t2i_retrieval.py +3 -4
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/rus/__init__.py +11 -2
- mteb/tasks/retrieval/rus/ria_news_retrieval.py +48 -44
- mteb/tasks/retrieval/tur/tur_hist_quad.py +2 -2
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/METADATA +5 -5
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/RECORD +82 -87
- mteb/descriptive_stats/Classification/PersianTextTone.json +0 -56
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchCount.json +0 -37
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDepth.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchDistance.json +0 -25
- mteb/descriptive_stats/Image/Any2TextMutipleChoice/CVBenchRelation.json +0 -25
- mteb/descriptive_stats/Image/VisualSTS/STS12VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS13VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS14VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS15VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS16VisualSTS.json +0 -20
- mteb/descriptive_stats/Image/VisualSTS/STS17MultilingualVisualSTS.json +0 -220
- mteb/descriptive_stats/Image/VisualSTS/STSBenchmarkMultilingualVisualSTS.json +0 -402
- mteb/descriptive_stats/Reranking/InstructIR.json +0 -31
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/WHEEL +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.1.0.dist-info → mteb-2.1.1.dist-info}/top_level.txt +0 -0
mteb/_create_dataloaders.py
CHANGED

```diff
@@ -277,6 +277,8 @@ def _custom_collate_fn(batch: list[dict[str, Any]]) -> dict[str, Any]:
             # Leave the images as a list to avoid stacking errors.
             collated[key] = [item[key] for item in batch]
         else:
+            if any(item[key] is None for item in batch):
+                raise ValueError(f"Found None in batch for key '{key}'")
             collated[key] = default_collate([item[key] for item in batch])
     return collated
```
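A minimal sketch of what the new guard changes in practice. This is a simplified stand-in for `_custom_collate_fn`, not the real implementation: the `image_keys` parameter is illustrative (the actual function decides per key how to collate), but the guarded branch mirrors the diff.

```python
from typing import Any

from torch.utils.data import default_collate


def collate(batch: list[dict[str, Any]], image_keys: set[str]) -> dict[str, Any]:
    collated: dict[str, Any] = {}
    for key in batch[0]:
        if key in image_keys:
            # Leave the images as a list to avoid stacking errors.
            collated[key] = [item[key] for item in batch]
        else:
            # New in 2.1.1: name the offending key up front instead of letting
            # default_collate fail later with an opaque TypeError on NoneType.
            if any(item[key] is None for item in batch):
                raise ValueError(f"Found None in batch for key '{key}'")
            collated[key] = default_collate([item[key] for item in batch])
    return collated


try:
    collate([{"text": "a"}, {"text": None}], image_keys=set())
except ValueError as err:
    print(err)  # Found None in batch for key 'text'
```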
mteb/abstasks/_stratification.py
CHANGED

```diff
@@ -134,7 +134,7 @@ def _get_most_desired_combination(samples_with_combination: dict):
 class IterativeStratification(_BaseKFold):
     """Iteratively stratify a multi-label data set into folds

-    Construct an
+    Construct an iterative stratifier that splits the data set into folds trying to maintain balanced representation
     with respect to order-th label combinations.
     """
```
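This hunk only repairs the docstring; behaviour is unchanged. For context, a hypothetical usage sketch, assuming the class follows the scikit-learn KFold split interface implied by its `_BaseKFold` parent. The constructor argument shown is an assumption, not confirmed by this diff.

```python
import numpy as np

from mteb.abstasks._stratification import IterativeStratification

X = np.arange(8).reshape(-1, 1)
# Multi-label targets: one row per sample, one column per label.
y = np.array([[1, 0], [1, 0], [0, 1], [0, 1], [1, 1], [1, 1], [0, 0], [1, 0]])

stratifier = IterativeStratification(n_splits=2)  # n_splits assumed, per _BaseKFold
for train_idx, test_idx in stratifier.split(X, y):
    # Each fold should roughly preserve per-label (order-th combination) frequencies.
    print(train_idx, test_idx)
```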
mteb/abstasks/abstask.py
CHANGED

```diff
@@ -459,7 +459,7 @@ class AbsTask(ABC):
         """Filter the languages of the task.

         Args:
-            languages: list of languages to filter the task by can be either a 3-letter
+            languages: list of languages to filter the task by can be either a 3-letter language code (e.g. "eng") or also include the script
                 (e.g. "eng-Latn")
             script: A list of scripts to filter the task by. Will be ignored if language code specified the script. If None, all scripts are included.
                 If the language code does not specify the script the intersection of the language and script will be used.
@@ -491,6 +491,11 @@
             if lang_scripts.contains_languages(langs):
                 subsets_to_keep.append(hf_subset)

+        if len(subsets_to_keep) == 0:
+            raise ValueError(
+                f"No subsets were found for {self.metadata.name} with filters: language code {languages}, script {script}, hf subsets {hf_subsets}."
+            )
+
         self.hf_subsets = subsets_to_keep
         return self
```
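The new guard turns a silently empty filter result into an immediate error. A hedged usage sketch follows; the task name is illustrative and the keyword names are taken from the docstring above.

```python
import mteb

task = mteb.get_task("BelebeleRetrieval")

# Keep only subsets matching a 3-letter code, optionally narrowed by script.
task = task.filter_languages(languages=["dan"], script=["Latn"])

# As of 2.1.1, a filter that matches no hf_subset raises here, instead of
# leaving the task with an empty subset list that only fails at evaluation time.
try:
    task.filter_languages(languages=["zzz"])  # a code the task does not cover
except ValueError as err:
    print(err)
```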
mteb/abstasks/retrieval.py
CHANGED

```diff
@@ -653,6 +653,8 @@ class AbsTaskRetrieval(AbsTask):
             FileNotFoundError: If the specified path does not exist.
             ValueError: If the loaded top ranked results are not in the expected format.
         """
+        self._top_k = top_k
+
         top_ranked_path = Path(top_ranked_path)
         if top_ranked_path.is_dir():
             top_ranked_path = self._predictions_path(top_ranked_path)
@@ -682,7 +684,6 @@
             top_k_sorted[query_id] = sorted_keys[: self._top_k]

         self.dataset[subset][split]["top_ranked"] = top_k_sorted
-        self._top_k = top_k
         return self
```
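The move matters because the second hunk reads `self._top_k` while truncating each query's ranking: previously the attribute was assigned only after the loop, so the first call truncated with a stale or unset value. A minimal reproduction of the corrected pattern, with illustrative names rather than mteb's real method:

```python
class TopRankedLoader:
    def load_top_ranked(self, preds: dict[str, dict[str, float]], top_k: int):
        self._top_k = top_k  # 2.1.1: assign before first use, not after the loop
        top_k_sorted = {}
        for query_id, scores in preds.items():
            sorted_keys = sorted(scores, key=scores.get, reverse=True)
            # Pre-fix, this line read self._top_k before it was ever assigned.
            top_k_sorted[query_id] = sorted_keys[: self._top_k]
        return top_k_sorted


ranks = TopRankedLoader().load_top_ranked(
    {"q1": {"d1": 0.9, "d2": 0.5, "d3": 0.1}}, top_k=2
)
print(ranks)  # {'q1': ['d1', 'd2']}
```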
mteb/abstasks/task_metadata.py
CHANGED

```diff
@@ -532,7 +532,7 @@ class TaskMetadata(BaseModel):
             citation=self.bibtex_citation,
             dataset_description=self.description,
             dataset_reference=self.reference,
-
+            descriptive_stats=descriptive_stats,
             dataset_task_name=self.name,
             category=self.category,
             domains=", ".join(self.domains) if self.domains else None,
```
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED

```diff
@@ -641,7 +641,7 @@ MTEB_KOR = Benchmark(
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
     tasks=get_tasks(
         languages=["kor"],
-        tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid
+        tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchmark, but for now I have left it as is.
             # Classification
             "KLUE-TC",
             # Reranking
@@ -975,8 +975,6 @@ MTEB_INDIC = Benchmark(
             # Bitext
             "IN22ConvBitextMining",
             "IN22GenBitextMining",
-            "IndicGenBenchFloresBitextMining",
-            "LinceMTBitextMining",
             # clustering
             "SIB200ClusteringS2S",
             # classification
@@ -985,7 +983,6 @@ MTEB_INDIC = Benchmark(
             "HindiDiscourseClassification",
             "SentimentAnalysisHindi",
             "MalayalamNewsClassification",
-            "IndicLangClassification",
             "MTOPIntentClassification",
             "MultiHateClassification",
             "TweetSentimentClassification",
@@ -1008,7 +1005,7 @@ MTEB_INDIC = Benchmark(
             # STS
         (get_task("IndicCrosslingualSTS"),)
     ),
-    description="A regional geopolitical text embedding benchmark
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1016,7 +1013,7 @@ MTEB_INDIC = Benchmark(


 eu_languages = [
-    # official EU languages (56) - we could include the whole economic area e.g. Norway -
+    # official EU languages (56) - we could include the whole economic area e.g. Norway - additionally we could include minority languages (probably a good idea?)
     # germanic
     "dan",
     "eng",
@@ -1084,7 +1081,6 @@ MTEB_EU = Benchmark(
         "AmazonCounterfactualClassification",
         "MassiveScenarioClassification",
         "MultiHateClassification",
-        "NordicLangClassification",
         "ScalaClassification",
         "SwissJudgementClassification",
         "TweetSentimentClassification",
@@ -1142,7 +1138,7 @@ MTEB_EU = Benchmark(
         languages=eu_languages,
         exclusive_language_filter=True,
     ),
-    description="A regional geopolitical text embedding benchmark
+    description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.",
     reference=None,
     citation=MMTEB_CITATION,
     contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1858,7 +1854,7 @@ MIEB_ENG = MIEBBenchmark(
     ),
     description="""MIEB(eng) is a comprehensive image embeddings benchmark, spanning 8 task types, covering 125 tasks.
 In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-document
+document understanding, visual STS, and CV-centric tasks.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
     citation=r"""
@@ -1892,7 +1888,7 @@ MIEB_MULTILINGUAL = MIEBBenchmark(
     ),
     description="""MIEB(Multilingual) is a comprehensive image embeddings benchmark, spanning 10 task types, covering 130 tasks and a total of 39 languages.
 In addition to image classification (zero shot and linear probing), clustering, retrieval, MIEB includes tasks in compositionality evaluation,
-document
+document understanding, visual STS, and CV-centric tasks. This benchmark consists of MIEB(eng) + 3 multilingual retrieval
 datasets + the multilingual parts of VisualSTS-b and VisualSTS-16.""",
     reference="https://arxiv.org/abs/2504.10471",
     contacts=["gowitheflow-1998", "isaac-chung"],
@@ -2113,7 +2109,7 @@ BUILT_MTEB = Benchmark(
         "BuiltBenchReranking",
     ],
     ),
-    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various
+    description='"Built-Bench" is an ongoing effort aimed at evaluating text embedding models in the context of built asset management, spanning over various disciplines such as architecture, engineering, construction, and operations management of the built environment.',
     reference="https://arxiv.org/abs/2411.12056",
     citation=r"""
 @article{shahinmoghadam2024benchmarking,
```
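All of the hunks above edit `Benchmark(...)` declarations in place. For orientation, a hedged sketch of that shape, using only the keywords visible in this diff; the benchmark name and task names are illustrative and the `Benchmark` import path is assumed.

```python
from mteb import get_tasks
from mteb.benchmarks import Benchmark  # import path assumed

MY_REGIONAL_BENCH = Benchmark(
    name="MyBench(dan, v1)",  # hypothetical name
    tasks=get_tasks(
        languages=["dan"],
        tasks=["TV2Nordretrieval", "TwitterHjerneRetrieval"],
    ),
    description="A hypothetical regional benchmark, declared like MTEB(Korean) above.",
    reference=None,
    citation=None,
    contacts=["KennethEnevoldsen"],
)
```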
mteb/benchmarks/get_benchmark.py
CHANGED

```diff
@@ -14,7 +14,7 @@ def _build_registry() -> dict[str, Benchmark]:

     benchmark_registry = {
         inst.name: inst
-        for
+        for _, inst in benchmark_module.__dict__.items()
         if isinstance(inst, Benchmark)
     }
     return benchmark_registry
```
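The repaired comprehension scans the benchmark module's namespace and keeps every `Benchmark` instance, keyed by its name. A self-contained sketch of the same pattern against a dummy namespace:

```python
from types import SimpleNamespace


class Benchmark:  # stand-in for mteb's Benchmark class
    def __init__(self, name: str):
        self.name = name


benchmark_module = SimpleNamespace(
    MTEB_DEMO=Benchmark("MTEB(demo)"),
    NOT_A_BENCHMARK=42,
    MIEB_DEMO=Benchmark("MIEB(demo)"),
)

benchmark_registry = {
    inst.name: inst
    for _, inst in benchmark_module.__dict__.items()
    if isinstance(inst, Benchmark)
}
print(sorted(benchmark_registry))  # ['MIEB(demo)', 'MTEB(demo)']
```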
mteb/descriptive_stats/Image/Any2AnyMultilingualRetrieval/XFlickr30kCoT2IRetrieval.json
CHANGED

```diff
@@ -1,183 +1,273 @@
 {
   "test": {
-    "number_of_characters": 1149877,
     "num_samples": 32000,
-    "
-    "
-    "
-
-
-
-
-
-
-
-
-    "
-
-
-
-
+    "number_of_characters": 1021877,
+    "documents_text_statistics": null,
+    "documents_image_statistics": {
+      "min_image_width": 176,
+      "average_image_width": 514.5045,
+      "max_image_width": 640,
+      "min_image_height": 144,
+      "average_image_height": 444.223,
+      "max_image_height": 640,
+      "unique_images": 2000
+    },
+    "queries_text_statistics": {
+      "total_text_length": 1021877,
+      "min_text_length": 4,
+      "average_text_length": 63.8673125,
+      "max_text_length": 377,
+      "unique_texts": 15986
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 16000,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 16000
+    },
+    "top_ranked_statistics": null,
     "hf_subset_descriptive_stats": {
       "de": {
-        "number_of_characters": 132154,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 132154,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 132154,
+          "min_text_length": 4,
+          "average_text_length": 66.077,
+          "max_text_length": 220,
+          "unique_texts": 1994
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "en": {
-        "number_of_characters": 153801,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 153801,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 153801,
+          "min_text_length": 34,
+          "average_text_length": 76.9005,
+          "max_text_length": 377,
+          "unique_texts": 2000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "es": {
-        "number_of_characters": 160049,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 160049,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 160049,
+          "min_text_length": 23,
+          "average_text_length": 80.0245,
+          "max_text_length": 342,
+          "unique_texts": 2000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "id": {
-        "number_of_characters": 167858,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 167858,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 167858,
+          "min_text_length": 4,
+          "average_text_length": 83.929,
+          "max_text_length": 211,
+          "unique_texts": 2000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "ja": {
-        "number_of_characters": 75480,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 75480,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 75480,
+          "min_text_length": 9,
+          "average_text_length": 37.74,
+          "max_text_length": 179,
+          "unique_texts": 2000
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "ru": {
-        "number_of_characters": 149947,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 149947,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 149947,
+          "min_text_length": 10,
+          "average_text_length": 74.9735,
+          "max_text_length": 294,
+          "unique_texts": 1997
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "tr": {
-        "number_of_characters": 136134,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 136134,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 136134,
+          "min_text_length": 19,
+          "average_text_length": 68.067,
+          "max_text_length": 199,
+          "unique_texts": 1997
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       },
       "zh": {
-        "number_of_characters": 46454,
         "num_samples": 4000,
-        "
-        "
-        "
-
-
-
-
-
-
-
-
-        "
-
-
-
-
+        "number_of_characters": 46454,
+        "documents_text_statistics": null,
+        "documents_image_statistics": {
+          "min_image_width": 176,
+          "average_image_width": 514.5045,
+          "max_image_width": 640,
+          "min_image_height": 144,
+          "average_image_height": 444.223,
+          "max_image_height": 640,
+          "unique_images": 2000
+        },
+        "queries_text_statistics": {
+          "total_text_length": 46454,
+          "min_text_length": 10,
+          "average_text_length": 23.227,
+          "max_text_length": 66,
+          "unique_texts": 1999
+        },
+        "queries_image_statistics": null,
+        "relevant_docs_statistics": {
+          "num_relevant_docs": 2000,
+          "min_relevant_docs_per_query": 1,
+          "average_relevant_docs_per_query": 1.0,
+          "max_relevant_docs_per_query": 1,
+          "unique_relevant_docs": 2000
+        },
+        "top_ranked_statistics": null
       }
     }
   }
```
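The regenerated statistics are internally consistent: the top-level `number_of_characters` dropped from 1,149,877 to 1,021,877 and now equals the sum of the per-subset query text totals, and each `average_text_length` is the total divided by the 2,000 queries per subset (16,000 overall). A quick check with values copied from the diff:

```python
subset_totals = {
    "de": 132154, "en": 153801, "es": 160049, "id": 167858,
    "ja": 75480, "ru": 149947, "tr": 136134, "zh": 46454,
}
assert sum(subset_totals.values()) == 1021877  # new top-level number_of_characters
assert 1021877 / 16000 == 63.8673125           # overall average_text_length
assert 132154 / 2000 == 66.077                 # "de" subset average_text_length
```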