mteb 2.5.2__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +17 -18
- mteb/_evaluators/any_sts_evaluator.py +3 -3
- mteb/_evaluators/clustering_evaluator.py +2 -2
- mteb/_evaluators/evaluator.py +4 -2
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +10 -8
- mteb/_evaluators/pair_classification_evaluator.py +5 -3
- mteb/_evaluators/retrieval_evaluator.py +2 -2
- mteb/_evaluators/retrieval_metrics.py +18 -17
- mteb/_evaluators/sklearn_evaluator.py +11 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +27 -18
- mteb/_evaluators/text/summarization_evaluator.py +23 -18
- mteb/_evaluators/zeroshot_classification_evaluator.py +5 -3
- mteb/abstasks/_data_filter/filters.py +1 -1
- mteb/abstasks/_data_filter/task_pipelines.py +3 -0
- mteb/abstasks/_statistics_calculation.py +18 -10
- mteb/abstasks/_stratification.py +18 -18
- mteb/abstasks/abstask.py +35 -28
- mteb/abstasks/aggregate_task_metadata.py +1 -9
- mteb/abstasks/aggregated_task.py +10 -29
- mteb/abstasks/classification.py +15 -10
- mteb/abstasks/clustering.py +19 -15
- mteb/abstasks/clustering_legacy.py +10 -10
- mteb/abstasks/image/image_text_pair_classification.py +7 -4
- mteb/abstasks/multilabel_classification.py +23 -19
- mteb/abstasks/pair_classification.py +20 -11
- mteb/abstasks/regression.py +4 -4
- mteb/abstasks/retrieval.py +28 -24
- mteb/abstasks/retrieval_dataset_loaders.py +2 -2
- mteb/abstasks/sts.py +8 -5
- mteb/abstasks/task_metadata.py +31 -33
- mteb/abstasks/text/bitext_mining.py +39 -28
- mteb/abstasks/text/reranking.py +8 -6
- mteb/abstasks/text/summarization.py +10 -5
- mteb/abstasks/zeroshot_classification.py +8 -4
- mteb/benchmarks/benchmark.py +4 -2
- mteb/benchmarks/benchmarks/__init__.py +4 -0
- mteb/benchmarks/benchmarks/benchmarks.py +112 -11
- mteb/benchmarks/get_benchmark.py +14 -55
- mteb/cache.py +182 -29
- mteb/cli/_display_tasks.py +2 -2
- mteb/cli/build_cli.py +110 -14
- mteb/cli/generate_model_card.py +43 -23
- mteb/deprecated_evaluator.py +63 -49
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json +32 -0
- mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json +32 -0
- mteb/descriptive_stats/Retrieval/ChemRxivRetrieval.json +30 -0
- mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json +116 -0
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/evaluate.py +44 -33
- mteb/filter_tasks.py +25 -26
- mteb/get_tasks.py +29 -30
- mteb/languages/language_scripts.py +5 -3
- mteb/leaderboard/app.py +162 -34
- mteb/load_results.py +12 -12
- mteb/models/abs_encoder.py +10 -6
- mteb/models/cache_wrappers/cache_backend_protocol.py +3 -5
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +5 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +6 -2
- mteb/models/cache_wrappers/cache_backends/numpy_cache.py +43 -25
- mteb/models/cache_wrappers/cache_wrapper.py +2 -2
- mteb/models/get_model_meta.py +21 -3
- mteb/models/instruct_wrapper.py +28 -8
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +4 -4
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +16 -16
- mteb/models/model_implementations/bm25.py +4 -4
- mteb/models/model_implementations/bmretriever_models.py +6 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +11 -4
- mteb/models/model_implementations/clip_models.py +6 -6
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +5 -5
- mteb/models/model_implementations/cohere_v.py +2 -2
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +42 -42
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +6 -6
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +6 -6
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +6 -5
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +48 -10
- mteb/models/model_implementations/jina_models.py +18 -11
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +4 -4
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +6 -6
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mcinext_models.py +4 -1
- mteb/models/model_implementations/mdbr_models.py +17 -3
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mixedbread_ai_models.py +332 -0
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +4 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +14 -14
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +2 -2
- mteb/models/model_implementations/nomic_models.py +30 -15
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +15 -9
- mteb/models/model_implementations/nvidia_models.py +151 -19
- mteb/models/model_implementations/octen_models.py +61 -2
- mteb/models/model_implementations/openclip_models.py +13 -13
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +10 -9
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/random_baseline.py +3 -3
- mteb/models/model_implementations/rasgaard_models.py +2 -2
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +3 -3
- mteb/models/model_implementations/rerankers_custom.py +12 -6
- mteb/models/model_implementations/rerankers_monot5_based.py +17 -17
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +113 -146
- mteb/models/model_implementations/sentence_transformers_models.py +124 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +20 -20
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +3 -3
- mteb/models/model_implementations/voyage_models.py +84 -0
- mteb/models/model_implementations/voyage_v.py +9 -7
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +80 -31
- mteb/models/models_protocols.py +22 -6
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +9 -6
- mteb/models/search_wrappers.py +33 -18
- mteb/models/sentence_transformer_wrapper.py +50 -25
- mteb/models/vllm_wrapper.py +327 -0
- mteb/py.typed +0 -0
- mteb/results/benchmark_results.py +29 -21
- mteb/results/model_result.py +52 -22
- mteb/results/task_result.py +80 -58
- mteb/similarity_functions.py +11 -7
- mteb/tasks/classification/dan/dk_hate_classification.py +1 -1
- mteb/tasks/classification/est/estonian_valence.py +1 -1
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/classification/multilingual/scala_classification.py +1 -1
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/image_text_pair_classification/eng/sugar_crepe.py +1 -1
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/code/code_rag.py +12 -12
- mteb/tasks/retrieval/dan/dan_fever_retrieval.py +1 -1
- mteb/tasks/retrieval/dan/tv2_nordretrieval.py +2 -2
- mteb/tasks/retrieval/dan/twitter_hjerne_retrieval.py +2 -2
- mteb/tasks/retrieval/eng/__init__.py +2 -0
- mteb/tasks/retrieval/eng/chemrxiv.py +33 -0
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/kor/__init__.py +15 -1
- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +142 -0
- mteb/tasks/retrieval/multilingual/__init__.py +2 -0
- mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py +43 -0
- mteb/tasks/retrieval/multilingual/vidore3_bench_retrieval.py +90 -100
- mteb/tasks/retrieval/nob/norquad.py +2 -2
- mteb/tasks/retrieval/nob/snl_retrieval.py +2 -2
- mteb/tasks/retrieval/tur/tur_hist_quad.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- mteb/types/__init__.py +2 -0
- mteb/types/_encoder_io.py +12 -0
- mteb/types/_result.py +2 -1
- mteb/types/statistics.py +9 -3
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/METADATA +15 -4
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/RECORD +240 -219
- mteb/models/model_implementations/mxbai_models.py +0 -111
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/WHEEL +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/entry_points.txt +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.5.2.dist-info → mteb-2.7.2.dist-info}/top_level.txt +0 -0
mteb/results/task_result.py
CHANGED
|
@@ -2,9 +2,9 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
|
-
|
|
5
|
+
import warnings
|
|
6
6
|
from collections import defaultdict
|
|
7
|
-
from collections.abc import Callable, Iterable
|
|
7
|
+
from collections.abc import Callable, Iterable, Mapping
|
|
8
8
|
from functools import cached_property
|
|
9
9
|
from importlib.metadata import version
|
|
10
10
|
from pathlib import Path
|
|
@@ -16,8 +16,11 @@ from packaging.version import Version
|
|
|
16
16
|
from pydantic import BaseModel, field_validator
|
|
17
17
|
from typing_extensions import Self
|
|
18
18
|
|
|
19
|
+
from mteb import TaskMetadata
|
|
19
20
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
21
|
+
from mteb.abstasks import AbsTaskClassification
|
|
20
22
|
from mteb.abstasks.abstask import AbsTask
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskDomain
|
|
21
24
|
from mteb.languages import LanguageScripts
|
|
22
25
|
from mteb.models.model_meta import ScoringFunction
|
|
23
26
|
from mteb.types import (
|
|
@@ -39,67 +42,59 @@ class Criteria(HelpfulStrEnum):
|
|
|
39
42
|
DATASET_REVISION = "dataset_revision"
|
|
40
43
|
|
|
41
44
|
|
|
42
|
-
class ScalaNbClassificationDummy:
|
|
45
|
+
class ScalaNbClassificationDummy(AbsTaskClassification):
|
|
43
46
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
44
47
|
|
|
45
|
-
metadata =
|
|
48
|
+
metadata = TaskMetadata(
|
|
46
49
|
name="ScalaNbClassification",
|
|
50
|
+
description="A dummy",
|
|
47
51
|
main_score="accuracy",
|
|
48
52
|
type="Classification",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
},
|
|
52
|
-
dataset={"revision": "revision_not_applicable"},
|
|
53
|
-
revision="revision_not_applicable",
|
|
53
|
+
eval_langs=["nob-Latn"],
|
|
54
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
54
55
|
)
|
|
55
56
|
|
|
56
57
|
|
|
57
|
-
class ScalaNnClassificationDummy:
|
|
58
|
+
class ScalaNnClassificationDummy(AbsTaskClassification):
|
|
58
59
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
59
60
|
|
|
60
|
-
metadata =
|
|
61
|
+
metadata = TaskMetadata(
|
|
61
62
|
name="ScalaNnClassification",
|
|
63
|
+
description="A dummy",
|
|
62
64
|
main_score="accuracy",
|
|
63
65
|
type="Classification",
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
},
|
|
67
|
-
dataset={"revision": "revision_not_applicable"},
|
|
68
|
-
revision="revision_not_applicable",
|
|
66
|
+
eval_langs=["nob-Latn"],
|
|
67
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
69
68
|
)
|
|
70
69
|
|
|
71
70
|
|
|
72
|
-
class ScalaDaClassificationDummy:
|
|
71
|
+
class ScalaDaClassificationDummy(AbsTaskClassification):
|
|
73
72
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
74
73
|
|
|
75
|
-
metadata =
|
|
74
|
+
metadata = TaskMetadata(
|
|
76
75
|
name="ScalaDaClassification",
|
|
76
|
+
description="A dummy",
|
|
77
77
|
main_score="accuracy",
|
|
78
78
|
type="Classification",
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
},
|
|
82
|
-
dataset={"revision": "revision_not_applicable"},
|
|
83
|
-
revision="revision_not_applicable",
|
|
79
|
+
eval_langs=["dan-Latn"],
|
|
80
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
84
81
|
)
|
|
85
82
|
|
|
86
83
|
|
|
87
|
-
class ScalaSvClassificationDummy:
|
|
84
|
+
class ScalaSvClassificationDummy(AbsTaskClassification):
|
|
88
85
|
"""A dummy task for loading historic results from before v1.11.0"""
|
|
89
86
|
|
|
90
|
-
metadata =
|
|
87
|
+
metadata = TaskMetadata(
|
|
91
88
|
name="ScalaSvClassification",
|
|
89
|
+
description="A dummy",
|
|
92
90
|
main_score="accuracy",
|
|
93
91
|
type="Classification",
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
},
|
|
97
|
-
dataset={"revision": "revision_not_applicable"},
|
|
98
|
-
revision="revision_not_applicable",
|
|
92
|
+
eval_langs=["swe-Latn"],
|
|
93
|
+
dataset={"path": "not/exists", "revision": "revision_not_applicable"},
|
|
99
94
|
)
|
|
100
95
|
|
|
101
96
|
|
|
102
|
-
outdated_tasks = {
|
|
97
|
+
outdated_tasks: dict[str, type[AbsTask]] = {
|
|
103
98
|
"ScalaNbClassification": ScalaNbClassificationDummy,
|
|
104
99
|
"ScalaNnClassification": ScalaNnClassificationDummy,
|
|
105
100
|
"ScalaDaClassification": ScalaDaClassificationDummy,
|
|
@@ -166,10 +161,10 @@ class TaskResult(BaseModel):
|
|
|
166
161
|
def from_task_results(
|
|
167
162
|
cls,
|
|
168
163
|
task: AbsTask | type[AbsTask],
|
|
169
|
-
scores: dict[SplitName,
|
|
164
|
+
scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
|
|
170
165
|
evaluation_time: float,
|
|
171
166
|
kg_co2_emissions: float | None = None,
|
|
172
|
-
) ->
|
|
167
|
+
) -> TaskResult:
|
|
173
168
|
"""Create a TaskResult from the task and scores.
|
|
174
169
|
|
|
175
170
|
Args:
|
|
@@ -246,12 +241,12 @@ class TaskResult(BaseModel):
|
|
|
246
241
|
return get_task(self.task_name)
|
|
247
242
|
|
|
248
243
|
@property
|
|
249
|
-
def domains(self) -> list[
|
|
244
|
+
def domains(self) -> list[TaskDomain]:
|
|
250
245
|
"""Get the domains of the task."""
|
|
251
246
|
doms = self.task.metadata.domains
|
|
252
247
|
if doms is None:
|
|
253
248
|
doms = []
|
|
254
|
-
return doms
|
|
249
|
+
return doms
|
|
255
250
|
|
|
256
251
|
@property
|
|
257
252
|
def task_type(self) -> str:
|
|
@@ -307,7 +302,7 @@ class TaskResult(BaseModel):
|
|
|
307
302
|
if isinstance(v, dict):
|
|
308
303
|
self._round_scores(v, n)
|
|
309
304
|
elif isinstance(v, float):
|
|
310
|
-
value[i] = round(v, n)
|
|
305
|
+
value[i] = round(v, n) # type: ignore[call-overload]
|
|
311
306
|
|
|
312
307
|
elif isinstance(value, float):
|
|
313
308
|
scores[key] = round(value, n)
|
|
@@ -325,7 +320,7 @@ class TaskResult(BaseModel):
|
|
|
325
320
|
json.dump(json_obj, f, indent=2)
|
|
326
321
|
|
|
327
322
|
@classmethod
|
|
328
|
-
def from_disk(cls, path: Path, load_historic_data: bool = True) ->
|
|
323
|
+
def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
|
|
329
324
|
"""Load TaskResult from disk.
|
|
330
325
|
|
|
331
326
|
Args:
|
|
@@ -356,7 +351,7 @@ class TaskResult(BaseModel):
|
|
|
356
351
|
) # assume it is before 1.11.0 if the version is not present
|
|
357
352
|
|
|
358
353
|
try:
|
|
359
|
-
obj = cls.model_validate(data)
|
|
354
|
+
obj: TaskResult = cls.model_validate(data)
|
|
360
355
|
except Exception as e:
|
|
361
356
|
if not pre_1_11_load:
|
|
362
357
|
raise e
|
|
@@ -381,6 +376,7 @@ class TaskResult(BaseModel):
|
|
|
381
376
|
from mteb import get_task
|
|
382
377
|
|
|
383
378
|
task_name = obj.task_name
|
|
379
|
+
task: AbsTask | type[AbsTask]
|
|
384
380
|
if task_name in outdated_tasks:
|
|
385
381
|
task = outdated_tasks[task_name]
|
|
386
382
|
else:
|
|
@@ -393,11 +389,11 @@ class TaskResult(BaseModel):
|
|
|
393
389
|
for key in list(hf_subset_scores.keys()):
|
|
394
390
|
if isinstance(hf_subset_scores[key], dict):
|
|
395
391
|
for k, v in hf_subset_scores[key].items():
|
|
396
|
-
hf_subset_scores[f"{key}_{k}"] = v
|
|
397
|
-
hf_subset_scores.pop(key)
|
|
392
|
+
hf_subset_scores[f"{key}_{k}"] = v # type: ignore[index]
|
|
393
|
+
hf_subset_scores.pop(key) # type: ignore[attr-defined]
|
|
398
394
|
|
|
399
395
|
@classmethod
|
|
400
|
-
def _convert_from_before_v1_11_0(cls, data: dict) ->
|
|
396
|
+
def _convert_from_before_v1_11_0(cls, data: dict) -> TaskResult:
|
|
401
397
|
from mteb.get_tasks import _TASKS_REGISTRY
|
|
402
398
|
|
|
403
399
|
# in case the task name is not found in the registry, try to find a lower case version
|
|
@@ -462,7 +458,9 @@ class TaskResult(BaseModel):
|
|
|
462
458
|
if main_score in hf_subset_scores:
|
|
463
459
|
hf_subset_scores["main_score"] = hf_subset_scores[main_score]
|
|
464
460
|
else:
|
|
465
|
-
|
|
461
|
+
msg = f"Main score {main_score} not found in scores"
|
|
462
|
+
logger.warning(msg)
|
|
463
|
+
warnings.warn(msg)
|
|
466
464
|
hf_subset_scores["main_score"] = None
|
|
467
465
|
|
|
468
466
|
# specific fixes:
|
|
@@ -481,7 +479,7 @@ class TaskResult(BaseModel):
|
|
|
481
479
|
scores["test"]["fra-fra"] = scores["test"].pop("fr")
|
|
482
480
|
|
|
483
481
|
result: TaskResult = TaskResult.from_task_results(
|
|
484
|
-
task,
|
|
482
|
+
task,
|
|
485
483
|
scores,
|
|
486
484
|
evaluation_time,
|
|
487
485
|
kg_co2_emissions=None,
|
|
@@ -532,7 +530,7 @@ class TaskResult(BaseModel):
|
|
|
532
530
|
def _get_score_fast(
|
|
533
531
|
self,
|
|
534
532
|
splits: Iterable[str] | None = None,
|
|
535
|
-
languages:
|
|
533
|
+
languages: list[ISOLanguage | ISOLanguageScript] | None = None,
|
|
536
534
|
subsets: Iterable[str] | None = None,
|
|
537
535
|
) -> float:
|
|
538
536
|
"""Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.
|
|
@@ -581,7 +579,7 @@ class TaskResult(BaseModel):
|
|
|
581
579
|
return val_sum / n_val
|
|
582
580
|
|
|
583
581
|
@classmethod
|
|
584
|
-
def from_validated(cls, **data) ->
|
|
582
|
+
def from_validated(cls, **data) -> TaskResult:
|
|
585
583
|
"""Create a TaskResult from validated data.
|
|
586
584
|
|
|
587
585
|
Returns:
|
|
@@ -592,13 +590,13 @@ class TaskResult(BaseModel):
|
|
|
592
590
|
def __repr__(self) -> str:
|
|
593
591
|
return f"TaskResult(task_name={self.task_name}, scores=...)"
|
|
594
592
|
|
|
595
|
-
def only_main_score(self) ->
|
|
593
|
+
def only_main_score(self) -> TaskResult:
|
|
596
594
|
"""Return a new TaskResult object with only the main score.
|
|
597
595
|
|
|
598
596
|
Returns:
|
|
599
597
|
A new TaskResult object with only the main score.
|
|
600
598
|
"""
|
|
601
|
-
new_scores = {}
|
|
599
|
+
new_scores: dict[str, list[Score]] = {}
|
|
602
600
|
for split in self.scores:
|
|
603
601
|
new_scores[split] = []
|
|
604
602
|
for subset_scores in self.scores[split]:
|
|
@@ -610,10 +608,12 @@ class TaskResult(BaseModel):
|
|
|
610
608
|
}
|
|
611
609
|
)
|
|
612
610
|
new_res = {**self.to_dict(), "scores": new_scores}
|
|
613
|
-
|
|
614
|
-
return new_res
|
|
611
|
+
return TaskResult.from_validated(**new_res)
|
|
615
612
|
|
|
616
|
-
def validate_and_filter_scores(
|
|
613
|
+
def validate_and_filter_scores(
|
|
614
|
+
self,
|
|
615
|
+
task: AbsTask | None = None,
|
|
616
|
+
) -> TaskResult:
|
|
617
617
|
"""Validate and filter the scores against the task metadata.
|
|
618
618
|
|
|
619
619
|
This ensures that the scores are correct for the given task, by removing any splits besides those specified in the task metadata.
|
|
@@ -635,7 +635,7 @@ class TaskResult(BaseModel):
|
|
|
635
635
|
splits = task.eval_splits
|
|
636
636
|
hf_subsets = set(task.hf_subsets) # Convert to set once
|
|
637
637
|
|
|
638
|
-
new_scores = {}
|
|
638
|
+
new_scores: dict[str, list[Score]] = {}
|
|
639
639
|
seen_splits = set()
|
|
640
640
|
for split in self.scores:
|
|
641
641
|
if split not in splits:
|
|
@@ -658,14 +658,36 @@ class TaskResult(BaseModel):
|
|
|
658
658
|
else:
|
|
659
659
|
missing_subsets_str = str(missing_subsets)
|
|
660
660
|
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
)
|
|
661
|
+
msg = f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
|
|
662
|
+
logger.warning(msg)
|
|
663
|
+
warnings.warn(msg)
|
|
664
|
+
for missing_subset in missing_subsets:
|
|
665
|
+
new_scores[split].append(
|
|
666
|
+
{
|
|
667
|
+
"hf_subset": missing_subset,
|
|
668
|
+
"main_score": np.nan,
|
|
669
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
670
|
+
missing_subset, []
|
|
671
|
+
),
|
|
672
|
+
}
|
|
673
|
+
)
|
|
664
674
|
seen_splits.add(split)
|
|
665
675
|
if seen_splits != set(splits):
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
)
|
|
676
|
+
msg = f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
|
|
677
|
+
logger.warning(msg)
|
|
678
|
+
warnings.warn(msg)
|
|
679
|
+
for missing_split in set(splits) - seen_splits:
|
|
680
|
+
new_scores[missing_split] = []
|
|
681
|
+
for missing_subset in hf_subsets:
|
|
682
|
+
new_scores[missing_split].append(
|
|
683
|
+
{
|
|
684
|
+
"hf_subset": missing_subset,
|
|
685
|
+
"main_score": np.nan,
|
|
686
|
+
"languages": task.metadata.hf_subsets_to_langscripts.get(
|
|
687
|
+
missing_subset, []
|
|
688
|
+
),
|
|
689
|
+
}
|
|
690
|
+
)
|
|
669
691
|
data = self.model_dump()
|
|
670
692
|
data["scores"] = new_scores
|
|
671
693
|
return type(self).model_construct(**data)
|
|
@@ -736,7 +758,7 @@ class TaskResult(BaseModel):
|
|
|
736
758
|
"mteb_version",
|
|
737
759
|
"dataset_revision",
|
|
738
760
|
],
|
|
739
|
-
) ->
|
|
761
|
+
) -> TaskResult:
|
|
740
762
|
"""Merges two TaskResult objects.
|
|
741
763
|
|
|
742
764
|
Args:
|
mteb/similarity_functions.py
CHANGED
|
@@ -186,7 +186,7 @@ def max_sim(a: Array, b: Array) -> torch.Tensor:
|
|
|
186
186
|
b,
|
|
187
187
|
)
|
|
188
188
|
|
|
189
|
-
return scores.max(axis=-1).values.sum(axis=-1)
|
|
189
|
+
return scores.max(axis=-1).values.sum(axis=-1) # type: ignore[call-overload]
|
|
190
190
|
|
|
191
191
|
|
|
192
192
|
# https://github.com/lightonai/pylate/blob/2d094a724866d6e15701781528368438081c0157/pylate/scores/scores.py#L67C1-L122C38
|
|
@@ -217,7 +217,7 @@ def pairwise_max_sim(
|
|
|
217
217
|
document_embedding,
|
|
218
218
|
)
|
|
219
219
|
|
|
220
|
-
scores.append(query_document_score.max(axis=-1).values.sum())
|
|
220
|
+
scores.append(query_document_score.max(axis=-1).values.sum()) # type: ignore[call-overload]
|
|
221
221
|
|
|
222
222
|
return torch.stack(scores, dim=0)
|
|
223
223
|
|
|
@@ -317,11 +317,15 @@ def similarity(text_embeddings: Array, input_embeddings: Array) -> Array:
|
|
|
317
317
|
Returns:
|
|
318
318
|
Matrix with similarities
|
|
319
319
|
"""
|
|
320
|
-
|
|
321
|
-
|
|
320
|
+
text_embeddings_tensor = _convert_to_tensor(text_embeddings)
|
|
321
|
+
input_embeddings_tensor = _convert_to_tensor(input_embeddings)
|
|
322
322
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
323
|
+
text_embeddings_tensor = text_embeddings_tensor / text_embeddings_tensor.norm(
|
|
324
|
+
dim=-1, keepdim=True
|
|
325
|
+
)
|
|
326
|
+
input_embeddings_tensor = input_embeddings_tensor / input_embeddings_tensor.norm(
|
|
327
|
+
dim=-1, keepdim=True
|
|
328
|
+
)
|
|
329
|
+
logits = torch.matmul(input_embeddings_tensor, text_embeddings_tensor.T)
|
|
326
330
|
probs = (logits * 100).softmax(dim=-1)
|
|
327
331
|
return probs
|
|
@@ -62,7 +62,7 @@ Piperidis, Stelios},
|
|
|
62
62
|
|
|
63
63
|
def dataset_transform(self):
|
|
64
64
|
# convert label to a 0/1 label
|
|
65
|
-
labels = self.dataset["train"]["label"]
|
|
65
|
+
labels = self.dataset["train"]["label"]
|
|
66
66
|
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
|
|
67
67
|
self.dataset = self.dataset.map(
|
|
68
68
|
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
|
|
@@ -45,7 +45,7 @@ class EstonianValenceClassification(AbsTaskClassification):
|
|
|
45
45
|
"valence", "label"
|
|
46
46
|
)
|
|
47
47
|
# convert label to a numbers
|
|
48
|
-
labels = self.dataset["train"]["label"]
|
|
48
|
+
labels = self.dataset["train"]["label"]
|
|
49
49
|
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
|
|
50
50
|
self.dataset = self.dataset.map(
|
|
51
51
|
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
|
|
@@ -25,7 +25,7 @@ class KurdishSentimentClassification(AbsTaskClassification):
|
|
|
25
25
|
dialect=["Sorani"],
|
|
26
26
|
sample_creation="found",
|
|
27
27
|
bibtex_citation=r"""
|
|
28
|
-
@article{
|
|
28
|
+
@article{badawi2024kurdisent,
|
|
29
29
|
author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali},
|
|
30
30
|
doi = {10.1007/s10579-023-09716-6},
|
|
31
31
|
journal = {Language Resources and Evaluation},
|
|
@@ -62,7 +62,7 @@ class KurdishSentimentClassificationV2(AbsTaskClassification):
|
|
|
62
62
|
dialect=["Sorani"],
|
|
63
63
|
sample_creation="found",
|
|
64
64
|
bibtex_citation=r"""
|
|
65
|
-
@article{
|
|
65
|
+
@article{badawi2024kurdisent,
|
|
66
66
|
author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali},
|
|
67
67
|
doi = {10.1007/s10579-023-09716-6},
|
|
68
68
|
journal = {Language Resources and Evaluation},
|
|
@@ -57,7 +57,7 @@ Fishel, Mark},
|
|
|
57
57
|
def dataset_transform(self):
|
|
58
58
|
for lang in self.dataset.keys():
|
|
59
59
|
# convert label to a 0/1 label
|
|
60
|
-
labels = self.dataset[lang]["train"]["label"]
|
|
60
|
+
labels = self.dataset[lang]["train"]["label"]
|
|
61
61
|
lab2idx = {lab: idx for idx, lab in enumerate(set(labels))}
|
|
62
62
|
self.dataset[lang] = self.dataset[lang].map(
|
|
63
63
|
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
|
|
@@ -25,7 +25,7 @@ class HUMEWikiCitiesClustering(AbsTaskClusteringLegacy):
|
|
|
25
25
|
dialect=[],
|
|
26
26
|
sample_creation="found",
|
|
27
27
|
bibtex_citation=r"""
|
|
28
|
-
@online{
|
|
28
|
+
@online{wikidump2024,
|
|
29
29
|
author = {Wikimedia Foundation},
|
|
30
30
|
title = {Wikimedia Downloads},
|
|
31
31
|
url = {https://dumps.wikimedia.org},
|
|
@@ -25,7 +25,7 @@ class WikiCitiesClustering(AbsTaskClusteringLegacy):
|
|
|
25
25
|
dialect=[],
|
|
26
26
|
sample_creation="found",
|
|
27
27
|
bibtex_citation=r"""
|
|
28
|
-
@online{
|
|
28
|
+
@online{wikidump2024,
|
|
29
29
|
author = {Wikimedia Foundation},
|
|
30
30
|
title = {Wikimedia Downloads},
|
|
31
31
|
url = {https://dumps.wikimedia.org},
|
|
@@ -226,7 +226,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClustering):
|
|
|
226
226
|
dialect=[],
|
|
227
227
|
sample_creation="found",
|
|
228
228
|
bibtex_citation=r"""
|
|
229
|
-
@software{
|
|
229
|
+
@software{sun2016thuctc,
|
|
230
230
|
author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
|
|
231
231
|
note = {THU Chinese Text Classification Toolkit},
|
|
232
232
|
publisher = {THU Natural Language Processing Lab},
|
|
@@ -285,7 +285,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClustering):
|
|
|
285
285
|
dialect=[],
|
|
286
286
|
sample_creation="found",
|
|
287
287
|
bibtex_citation=r"""
|
|
288
|
-
@software{
|
|
288
|
+
@software{sun2016thuctc,
|
|
289
289
|
author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
|
|
290
290
|
note = {THU Chinese Text Classification Toolkit},
|
|
291
291
|
publisher = {THU Natural Language Processing Lab},
|
|
@@ -49,7 +49,7 @@ class SugarCrepe(AbsTaskImageTextPairClassification):
|
|
|
49
49
|
"""Load dataset from HuggingFace hub"""
|
|
50
50
|
if self.data_loaded:
|
|
51
51
|
return
|
|
52
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
52
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
53
53
|
self.dataset = datasets.DatasetDict({"test": self.dataset["train"]})
|
|
54
54
|
self.dataset_transform()
|
|
55
55
|
self.data_loaded = True
|
|
@@ -44,7 +44,7 @@ class WikipediaRerankingMultilingual(AbsTaskRetrieval):
|
|
|
44
44
|
dialect=[],
|
|
45
45
|
sample_creation="LM-generated and verified",
|
|
46
46
|
bibtex_citation=r"""
|
|
47
|
-
@online{
|
|
47
|
+
@online{wikidump2024,
|
|
48
48
|
author = {Wikimedia Foundation},
|
|
49
49
|
title = {Wikimedia Downloads},
|
|
50
50
|
url = {https://dumps.wikimedia.org},
|
|
@@ -48,14 +48,14 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
48
48
|
"path": "code-rag-bench/programming-solutions",
|
|
49
49
|
"revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6",
|
|
50
50
|
},
|
|
51
|
-
**common_args,
|
|
51
|
+
**common_args,
|
|
52
52
|
)
|
|
53
53
|
|
|
54
54
|
def load_data(self) -> None:
|
|
55
55
|
"""Load dataset from HuggingFace hub"""
|
|
56
56
|
if self.data_loaded:
|
|
57
57
|
return
|
|
58
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
58
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
59
59
|
self.dataset_transform()
|
|
60
60
|
self.data_loaded = True
|
|
61
61
|
|
|
@@ -71,7 +71,7 @@ class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
71
71
|
self.queries = {}
|
|
72
72
|
|
|
73
73
|
split = self.metadata.eval_splits[0]
|
|
74
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
74
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
75
75
|
ds = ds.shuffle(seed=42)
|
|
76
76
|
|
|
77
77
|
self.queries[split] = {}
|
|
@@ -105,14 +105,14 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
|
|
|
105
105
|
"path": "code-rag-bench/online-tutorials",
|
|
106
106
|
"revision": "095bb77130082e4690d6c3a031997b03487bf6e2",
|
|
107
107
|
},
|
|
108
|
-
**common_args,
|
|
108
|
+
**common_args,
|
|
109
109
|
)
|
|
110
110
|
|
|
111
111
|
def load_data(self) -> None:
|
|
112
112
|
"""Load dataset from HuggingFace hub"""
|
|
113
113
|
if self.data_loaded:
|
|
114
114
|
return
|
|
115
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
115
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
116
116
|
self.dataset_transform()
|
|
117
117
|
self.data_loaded = True
|
|
118
118
|
|
|
@@ -128,7 +128,7 @@ class CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval):
|
|
|
128
128
|
self.queries = {}
|
|
129
129
|
|
|
130
130
|
split = self.metadata.eval_splits[0]
|
|
131
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
131
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
132
132
|
ds = ds.shuffle(seed=42)
|
|
133
133
|
|
|
134
134
|
self.queries[split] = {}
|
|
@@ -165,14 +165,14 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
165
165
|
"path": "code-rag-bench/library-documentation",
|
|
166
166
|
"revision": "b530d3b5a25087d2074e731b76232db85b9e9107",
|
|
167
167
|
},
|
|
168
|
-
**common_args,
|
|
168
|
+
**common_args,
|
|
169
169
|
)
|
|
170
170
|
|
|
171
171
|
def load_data(self) -> None:
|
|
172
172
|
"""Load dataset from HuggingFace hub"""
|
|
173
173
|
if self.data_loaded:
|
|
174
174
|
return
|
|
175
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
175
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
176
176
|
self.dataset_transform()
|
|
177
177
|
self.data_loaded = True
|
|
178
178
|
|
|
@@ -188,7 +188,7 @@ class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval):
|
|
|
188
188
|
self.queries = {}
|
|
189
189
|
|
|
190
190
|
split = self.metadata.eval_splits[0]
|
|
191
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
191
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
192
192
|
ds = ds.shuffle(seed=42)
|
|
193
193
|
|
|
194
194
|
self.queries[split] = {}
|
|
@@ -222,14 +222,14 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
|
|
|
222
222
|
"path": "code-rag-bench/stackoverflow-posts",
|
|
223
223
|
"revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4",
|
|
224
224
|
},
|
|
225
|
-
**common_args,
|
|
225
|
+
**common_args,
|
|
226
226
|
)
|
|
227
227
|
|
|
228
228
|
def load_data(self) -> None:
|
|
229
229
|
"""Load dataset from HuggingFace hub"""
|
|
230
230
|
if self.data_loaded:
|
|
231
231
|
return
|
|
232
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
232
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
233
233
|
self.dataset_transform()
|
|
234
234
|
self.data_loaded = True
|
|
235
235
|
|
|
@@ -245,7 +245,7 @@ class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval):
|
|
|
245
245
|
self.queries = {}
|
|
246
246
|
|
|
247
247
|
split = self.metadata.eval_splits[0]
|
|
248
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
248
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
249
249
|
ds = ds.shuffle(seed=42)
|
|
250
250
|
|
|
251
251
|
self.queries[split] = {}
|
|
@@ -51,7 +51,7 @@ Derczynski, Leon},
|
|
|
51
51
|
"""Load dataset from HuggingFace hub"""
|
|
52
52
|
if self.data_loaded:
|
|
53
53
|
return
|
|
54
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
54
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
55
55
|
self.dataset_transform()
|
|
56
56
|
self.data_loaded = True
|
|
57
57
|
|
|
@@ -64,7 +64,7 @@ Piperidis, Stelios},
|
|
|
64
64
|
"""Load dataset from HuggingFace hub"""
|
|
65
65
|
if self.data_loaded:
|
|
66
66
|
return
|
|
67
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
67
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
68
68
|
self.dataset_transform()
|
|
69
69
|
self.data_loaded = True
|
|
70
70
|
|
|
@@ -81,7 +81,7 @@ Piperidis, Stelios},
|
|
|
81
81
|
text2id = {}
|
|
82
82
|
|
|
83
83
|
for split in self.dataset:
|
|
84
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
84
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
85
85
|
ds = ds.shuffle(seed=42)
|
|
86
86
|
ds = ds.select(
|
|
87
87
|
range(2048)
|
|
@@ -40,7 +40,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
|
|
|
40
40
|
"""Load dataset from HuggingFace hub"""
|
|
41
41
|
if self.data_loaded:
|
|
42
42
|
return
|
|
43
|
-
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
43
|
+
self.dataset = datasets.load_dataset(**self.metadata.dataset)
|
|
44
44
|
self.dataset_transform()
|
|
45
45
|
self.data_loaded = True
|
|
46
46
|
|
|
@@ -57,7 +57,7 @@ class TwitterHjerneRetrieval(AbsTaskRetrieval):
|
|
|
57
57
|
text2id = {}
|
|
58
58
|
|
|
59
59
|
for split in self.dataset:
|
|
60
|
-
ds: datasets.Dataset = self.dataset[split]
|
|
60
|
+
ds: datasets.Dataset = self.dataset[split]
|
|
61
61
|
ds = ds.map(answers_to_list)
|
|
62
62
|
|
|
63
63
|
self.queries[split] = {}
|
|
@@ -18,6 +18,7 @@ from .built_bench_retrieval import BuiltBenchRetrieval
|
|
|
18
18
|
from .chat_doctor_retrieval import ChatDoctorRetrieval
|
|
19
19
|
from .chem_hotpot_qa_retrieval import ChemHotpotQARetrieval
|
|
20
20
|
from .chem_nq_retrieval import ChemNQRetrieval
|
|
21
|
+
from .chemrxiv import ChemRxivRetrieval
|
|
21
22
|
from .cirr_it2i_retrieval import CIRRIT2IRetrieval
|
|
22
23
|
from .climate_fever_retrieval import (
|
|
23
24
|
ClimateFEVER,
|
|
@@ -254,6 +255,7 @@ __all__ = [
|
|
|
254
255
|
"ChatDoctorRetrieval",
|
|
255
256
|
"ChemHotpotQARetrieval",
|
|
256
257
|
"ChemNQRetrieval",
|
|
258
|
+
"ChemRxivRetrieval",
|
|
257
259
|
"ClimateFEVER",
|
|
258
260
|
"ClimateFEVERHardNegatives",
|
|
259
261
|
"ClimateFEVERHardNegativesV2",
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from mteb.abstasks.retrieval import AbsTaskRetrieval
|
|
2
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ChemRxivRetrieval(AbsTaskRetrieval):
|
|
6
|
+
metadata = TaskMetadata(
|
|
7
|
+
name="ChemRxivRetrieval",
|
|
8
|
+
dataset={
|
|
9
|
+
"path": "BASF-AI/ChemRxivRetrieval",
|
|
10
|
+
"revision": "5377aa18f309ec440ff6325a4c2cd3362c2cb8d7",
|
|
11
|
+
},
|
|
12
|
+
description="A retrieval task based on ChemRxiv papers where queries are LLM-synthesized to match specific paragraphs.",
|
|
13
|
+
reference="https://arxiv.org/abs/2508.01643",
|
|
14
|
+
type="Retrieval",
|
|
15
|
+
category="t2t",
|
|
16
|
+
modalities=["text"],
|
|
17
|
+
eval_splits=["test"],
|
|
18
|
+
eval_langs=["eng-Latn"],
|
|
19
|
+
main_score="ndcg_at_10",
|
|
20
|
+
date=("2025-01-01", "2025-05-01"),
|
|
21
|
+
domains=["Chemistry"],
|
|
22
|
+
task_subtypes=["Question answering", "Article retrieval"],
|
|
23
|
+
license="cc-by-nc-sa-4.0",
|
|
24
|
+
annotations_creators="LM-generated and reviewed",
|
|
25
|
+
dialect=[],
|
|
26
|
+
sample_creation="found",
|
|
27
|
+
bibtex_citation="""@article{kasmaee2025chembed,
|
|
28
|
+
author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Astaraki, Mahdi and Saloot, Mohammad Arshi and Sherck, Nicholas and Mahyar, Hamidreza and Samiee, Soheila},
|
|
29
|
+
journal = {arXiv preprint arXiv:2508.01643},
|
|
30
|
+
title = {Chembed: Enhancing chemical literature search through domain-specific text embeddings},
|
|
31
|
+
year = {2025},
|
|
32
|
+
}""",
|
|
33
|
+
)
|