mteb 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +16 -9
- mteb/_evaluators/any_sts_evaluator.py +10 -5
- mteb/_evaluators/clustering_evaluator.py +10 -4
- mteb/_evaluators/evaluator.py +9 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
- mteb/_evaluators/pair_classification_evaluator.py +10 -5
- mteb/_evaluators/retrieval_evaluator.py +19 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +14 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
- mteb/_evaluators/text/summarization_evaluator.py +8 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +17 -9
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +15 -6
- mteb/abstasks/clustering.py +17 -8
- mteb/abstasks/clustering_legacy.py +14 -6
- mteb/abstasks/image/image_text_pair_classification.py +17 -7
- mteb/abstasks/multilabel_classification.py +11 -5
- mteb/abstasks/pair_classification.py +19 -9
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +28 -17
- mteb/abstasks/retrieval_dataset_loaders.py +11 -8
- mteb/abstasks/sts.py +19 -10
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +14 -7
- mteb/abstasks/text/summarization.py +17 -7
- mteb/abstasks/zeroshot_classification.py +15 -7
- mteb/benchmarks/_create_table.py +13 -3
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +20 -18
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +7 -3
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +11 -4
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +22 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +16 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +22 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +25 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +19 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +19 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +15 -9
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/clustering/nob/snl_clustering.py +7 -2
- mteb/tasks/clustering/nob/vg_clustering.py +7 -2
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/RECORD +238 -217
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
|
@@ -7,19 +7,20 @@ from typing import TYPE_CHECKING, Any
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import torch
|
|
9
9
|
from packaging.version import Version
|
|
10
|
-
from torch.utils.data import DataLoader
|
|
11
|
-
from typing_extensions import Unpack
|
|
12
10
|
|
|
13
11
|
from mteb._log_once import LogOnce
|
|
14
12
|
from mteb.models import ModelMeta
|
|
15
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import PromptType
|
|
16
14
|
|
|
17
15
|
from .abs_encoder import AbsEncoder
|
|
18
16
|
|
|
19
17
|
if TYPE_CHECKING:
|
|
20
18
|
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
19
|
+
from torch.utils.data import DataLoader
|
|
20
|
+
from typing_extensions import Unpack
|
|
21
21
|
|
|
22
22
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
23
|
+
from mteb.types import Array, BatchedInput, EncodeKwargs
|
|
23
24
|
|
|
24
25
|
logger = logging.getLogger(__name__)
|
|
25
26
|
|
mteb/models/vllm_wrapper.py
CHANGED
|
@@ -4,23 +4,25 @@ import atexit
|
|
|
4
4
|
import gc
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
|
-
from collections.abc import Callable
|
|
8
7
|
from typing import TYPE_CHECKING, Any, Literal
|
|
9
8
|
|
|
10
9
|
import numpy as np
|
|
11
10
|
import torch
|
|
12
|
-
from torch.utils.data import DataLoader
|
|
13
11
|
|
|
14
12
|
from mteb._requires_package import requires_package
|
|
15
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
13
|
from mteb.models import ModelMeta
|
|
17
14
|
from mteb.models.abs_encoder import AbsEncoder
|
|
18
|
-
from mteb.types import
|
|
15
|
+
from mteb.types import PromptType
|
|
19
16
|
|
|
20
17
|
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
|
|
20
|
+
from torch.utils.data import DataLoader
|
|
21
21
|
from vllm.config import PoolerConfig # type: ignore[import-not-found]
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
|
|
23
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
24
|
+
from mteb.types import Array, BatchedInput
|
|
25
|
+
|
|
24
26
|
|
|
25
27
|
logger = logging.getLogger(__name__)
|
|
26
28
|
|
|
@@ -4,34 +4,39 @@ import functools
|
|
|
4
4
|
import json
|
|
5
5
|
import logging
|
|
6
6
|
import warnings
|
|
7
|
-
from collections.abc import Callable, Iterable, Iterator
|
|
8
7
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Literal, cast
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
10
9
|
|
|
11
10
|
import pandas as pd
|
|
12
11
|
from packaging.version import InvalidVersion, Version
|
|
13
12
|
from pydantic import BaseModel, ConfigDict
|
|
14
|
-
from typing_extensions import Self
|
|
15
13
|
|
|
16
|
-
from mteb.abstasks.abstask import AbsTask
|
|
17
|
-
from mteb.abstasks.task_metadata import (
|
|
18
|
-
TaskDomain,
|
|
19
|
-
TaskType,
|
|
20
|
-
)
|
|
21
14
|
from mteb.benchmarks.benchmark import Benchmark
|
|
22
15
|
from mteb.models import ModelMeta
|
|
23
16
|
from mteb.models.get_model_meta import get_model_metas
|
|
24
|
-
from mteb.types import (
|
|
25
|
-
ISOLanguage,
|
|
26
|
-
ISOLanguageScript,
|
|
27
|
-
Modalities,
|
|
28
|
-
Score,
|
|
29
|
-
ScoresDict,
|
|
30
|
-
SplitName,
|
|
31
|
-
)
|
|
32
17
|
|
|
33
18
|
from .model_result import ModelResult, _aggregate_and_pivot
|
|
34
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from collections.abc import Callable, Iterable, Iterator
|
|
22
|
+
|
|
23
|
+
from typing_extensions import Self
|
|
24
|
+
|
|
25
|
+
from mteb.abstasks.abstask import AbsTask
|
|
26
|
+
from mteb.abstasks.task_metadata import (
|
|
27
|
+
TaskDomain,
|
|
28
|
+
TaskType,
|
|
29
|
+
)
|
|
30
|
+
from mteb.types import (
|
|
31
|
+
ISOLanguage,
|
|
32
|
+
ISOLanguageScript,
|
|
33
|
+
Modalities,
|
|
34
|
+
Score,
|
|
35
|
+
ScoresDict,
|
|
36
|
+
SplitName,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
35
40
|
logger = logging.getLogger(__name__)
|
|
36
41
|
|
|
37
42
|
|
|
@@ -144,7 +149,7 @@ class BenchmarkResults(BaseModel):
|
|
|
144
149
|
raise ValueError("name in ModelMeta is None. It must be a string.")
|
|
145
150
|
name_rev[name.name] = name.revision
|
|
146
151
|
else:
|
|
147
|
-
name_ = cast(str, name)
|
|
152
|
+
name_ = cast("str", name)
|
|
148
153
|
name_rev[name_] = revision
|
|
149
154
|
|
|
150
155
|
for model_res in self.model_results:
|
mteb/results/model_result.py
CHANGED
|
@@ -2,30 +2,36 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import warnings
|
|
5
|
-
from
|
|
6
|
-
from typing import Any, Literal, cast
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
7
6
|
|
|
8
7
|
import numpy as np
|
|
9
8
|
import pandas as pd
|
|
10
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
11
10
|
from typing_extensions import overload
|
|
12
11
|
|
|
13
|
-
from mteb.abstasks.abstask import AbsTask
|
|
14
|
-
from mteb.abstasks.task_metadata import (
|
|
15
|
-
TaskDomain,
|
|
16
|
-
TaskType,
|
|
17
|
-
)
|
|
18
12
|
from mteb.types import (
|
|
19
|
-
ISOLanguage,
|
|
20
|
-
ISOLanguageScript,
|
|
21
13
|
Modalities,
|
|
22
|
-
Score,
|
|
23
|
-
ScoresDict,
|
|
24
|
-
SplitName,
|
|
25
14
|
)
|
|
26
15
|
|
|
27
16
|
from .task_result import TaskError, TaskResult
|
|
28
17
|
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Callable, Iterable
|
|
20
|
+
|
|
21
|
+
from mteb.abstasks.abstask import AbsTask
|
|
22
|
+
from mteb.abstasks.task_metadata import (
|
|
23
|
+
TaskDomain,
|
|
24
|
+
TaskType,
|
|
25
|
+
)
|
|
26
|
+
from mteb.types import (
|
|
27
|
+
ISOLanguage,
|
|
28
|
+
ISOLanguageScript,
|
|
29
|
+
Score,
|
|
30
|
+
ScoresDict,
|
|
31
|
+
SplitName,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
29
35
|
logger = logging.getLogger(__name__)
|
|
30
36
|
|
|
31
37
|
|
|
@@ -83,7 +89,7 @@ class ModelResult(BaseModel):
|
|
|
83
89
|
model_revision: str | None
|
|
84
90
|
task_results: list[TaskResult]
|
|
85
91
|
default_modalities: list[Modalities] = Field(
|
|
86
|
-
default_factory=lambda: [cast(Modalities, "text")], alias="modalities"
|
|
92
|
+
default_factory=lambda: [cast("Modalities", "text")], alias="modalities"
|
|
87
93
|
)
|
|
88
94
|
model_config = (
|
|
89
95
|
ConfigDict( # to free up the name model_* which is otherwise protected
|
|
@@ -202,8 +208,8 @@ class ModelResult(BaseModel):
|
|
|
202
208
|
aggregation = aggregation if aggregation is not None else np.mean
|
|
203
209
|
else:
|
|
204
210
|
use_fast = True
|
|
205
|
-
aggregation = cast(Callable[[list[Score]], Any], aggregation)
|
|
206
|
-
getter = cast(Callable[[ScoresDict], Score], getter)
|
|
211
|
+
aggregation = cast("Callable[[list[Score]], Any]", aggregation)
|
|
212
|
+
getter = cast("Callable[[ScoresDict], Score]", getter)
|
|
207
213
|
|
|
208
214
|
if format == "wide":
|
|
209
215
|
scores = {}
|
mteb/results/task_result.py
CHANGED
|
@@ -4,34 +4,40 @@ import json
|
|
|
4
4
|
import logging
|
|
5
5
|
import warnings
|
|
6
6
|
from collections import defaultdict
|
|
7
|
-
from collections.abc import Callable, Iterable, Mapping
|
|
8
7
|
from functools import cached_property
|
|
9
8
|
from importlib.metadata import version
|
|
10
|
-
from
|
|
11
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
12
10
|
|
|
13
11
|
import numpy as np
|
|
14
12
|
from huggingface_hub import EvalResult
|
|
15
13
|
from packaging.version import Version
|
|
16
14
|
from pydantic import BaseModel, field_validator
|
|
17
|
-
from typing_extensions import Self
|
|
18
15
|
|
|
19
16
|
from mteb import TaskMetadata
|
|
20
17
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
21
18
|
from mteb.abstasks import AbsTaskClassification
|
|
22
19
|
from mteb.abstasks.abstask import AbsTask
|
|
23
|
-
from mteb.abstasks.task_metadata import TaskDomain
|
|
24
20
|
from mteb.languages import LanguageScripts
|
|
25
21
|
from mteb.models.model_meta import ScoringFunction
|
|
26
22
|
from mteb.types import (
|
|
27
|
-
HFSubset,
|
|
28
|
-
ISOLanguage,
|
|
29
|
-
ISOLanguageScript,
|
|
30
|
-
Score,
|
|
31
23
|
ScoresDict,
|
|
32
24
|
SplitName,
|
|
33
25
|
)
|
|
34
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from collections.abc import Callable, Iterable, Mapping
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
from typing_extensions import Self
|
|
32
|
+
|
|
33
|
+
from mteb.abstasks.task_metadata import TaskDomain
|
|
34
|
+
from mteb.types import (
|
|
35
|
+
HFSubset,
|
|
36
|
+
ISOLanguage,
|
|
37
|
+
ISOLanguageScript,
|
|
38
|
+
Score,
|
|
39
|
+
)
|
|
40
|
+
|
|
35
41
|
logger = logging.getLogger(__name__)
|
|
36
42
|
|
|
37
43
|
|
mteb/similarity_functions.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
1
5
|
import torch
|
|
2
6
|
|
|
3
|
-
from mteb.models import EncoderProtocol
|
|
4
7
|
from mteb.models.model_meta import ScoringFunction
|
|
5
|
-
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from mteb.models import EncoderProtocol
|
|
11
|
+
from mteb.types import Array
|
|
6
12
|
|
|
7
13
|
|
|
8
14
|
def _use_torch_compile():
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrieval,
|
|
5
5
|
CQADupstackEnglishRetrieval,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrieval,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrieval(),
|
|
20
20
|
CQADupstackEnglishRetrieval(),
|
|
21
21
|
CQADupstackGamingRetrieval(),
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts17_multilingual_visual_sts import (
|
|
4
4
|
STS17MultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_sts17
|
|
7
|
+
task_list_sts17 = [
|
|
8
8
|
STS17MultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["eng"], hf_subsets=["en-en"]
|
|
10
10
|
)
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts_benchmark_multilingual_visual_sts import (
|
|
4
4
|
STSBenchmarkMultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_stsb
|
|
7
|
+
task_list_stsb = [
|
|
8
8
|
STSBenchmarkMultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["eng"], hf_subsets=["en"]
|
|
10
10
|
)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrievalFa,
|
|
5
5
|
CQADupstackEnglishRetrievalFa,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrievalFa,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrievalFa(),
|
|
20
20
|
CQADupstackEnglishRetrievalFa(),
|
|
21
21
|
CQADupstackGamingRetrievalFa(),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.classification import (
|
|
4
4
|
SynPerChatbotConvSAAnger,
|
|
5
5
|
SynPerChatbotConvSAFear,
|
|
@@ -12,7 +12,7 @@ from mteb.tasks.classification import (
|
|
|
12
12
|
SynPerChatbotConvSASurprise,
|
|
13
13
|
)
|
|
14
14
|
|
|
15
|
-
task_list_cqa
|
|
15
|
+
task_list_cqa = [
|
|
16
16
|
SynPerChatbotConvSAAnger(),
|
|
17
17
|
SynPerChatbotConvSASatisfaction(),
|
|
18
18
|
SynPerChatbotConvSAFriendship(),
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts17_multilingual_visual_sts import (
|
|
4
4
|
STS17MultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_sts17_multi
|
|
7
|
+
task_list_sts17_multi = [
|
|
8
8
|
STS17MultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=["ara", "eng", "spa", "kor"],
|
|
10
10
|
hf_subsets=[
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from mteb.abstasks.
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.sts.multilingual.sts_benchmark_multilingual_visual_sts import (
|
|
4
4
|
STSBenchmarkMultilingualVisualSTS,
|
|
5
5
|
)
|
|
6
6
|
|
|
7
|
-
task_list_multi
|
|
7
|
+
task_list_multi = [
|
|
8
8
|
STSBenchmarkMultilingualVisualSTS().filter_languages(
|
|
9
9
|
languages=[
|
|
10
10
|
"deu",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval import (
|
|
4
4
|
CQADupstackAndroidNLRetrieval,
|
|
5
5
|
CQADupstackEnglishNLRetrieval,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval import (
|
|
|
15
15
|
CQADupstackWordpressNLRetrieval,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidNLRetrieval(),
|
|
20
20
|
CQADupstackEnglishNLRetrieval(),
|
|
21
21
|
CQADupstackGamingNLRetrieval(),
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from mteb.abstasks import
|
|
2
|
-
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
1
|
+
from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata
|
|
2
|
+
from mteb.abstasks.aggregated_task import AbsTaskAggregate
|
|
3
3
|
from mteb.tasks.retrieval.pol.cqadupstack_pl_retrieval import (
|
|
4
4
|
CQADupstackAndroidRetrievalPL,
|
|
5
5
|
CQADupstackEnglishRetrievalPL,
|
|
@@ -15,7 +15,7 @@ from mteb.tasks.retrieval.pol.cqadupstack_pl_retrieval import (
|
|
|
15
15
|
CQADupstackWordpressRetrievalPL,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
task_list_cqa
|
|
18
|
+
task_list_cqa = [
|
|
19
19
|
CQADupstackAndroidRetrievalPL(),
|
|
20
20
|
CQADupstackEnglishRetrievalPL(),
|
|
21
21
|
CQADupstackGamingRetrievalPL(),
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import random
|
|
2
|
-
from collections.abc import Iterable
|
|
3
4
|
from itertools import islice
|
|
4
|
-
from typing import TypeVar
|
|
5
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
5
6
|
|
|
6
7
|
import datasets
|
|
7
8
|
|
|
8
9
|
from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy
|
|
9
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
|
|
11
16
|
T = TypeVar("T")
|
|
12
17
|
|
|
13
18
|
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import random
|
|
2
|
-
from collections.abc import Iterable
|
|
3
4
|
from itertools import islice
|
|
4
|
-
from typing import TypeVar
|
|
5
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
5
6
|
|
|
6
7
|
import datasets
|
|
7
8
|
|
|
8
9
|
from mteb.abstasks.clustering_legacy import AbsTaskClusteringLegacy
|
|
9
10
|
from mteb.abstasks.task_metadata import TaskMetadata
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
|
|
11
16
|
T = TypeVar("T")
|
|
12
17
|
|
|
13
18
|
|
|
@@ -14,6 +14,28 @@ from .birco_whats_that_book_reranking import BIRCOWhatsThatBookReranking
|
|
|
14
14
|
from .blink_it2i_retrieval import BLINKIT2IRetrieval
|
|
15
15
|
from .blink_it2t_retrieval import BLINKIT2TRetrieval
|
|
16
16
|
from .bright_retrieval import BrightLongRetrieval, BrightRetrieval
|
|
17
|
+
from .bright_v1_1_retrieval import (
|
|
18
|
+
BrightAopsRetrieval,
|
|
19
|
+
BrightBiologyLongRetrieval,
|
|
20
|
+
BrightBiologyRetrieval,
|
|
21
|
+
BrightEarthScienceLongRetrieval,
|
|
22
|
+
BrightEarthScienceRetrieval,
|
|
23
|
+
BrightEconomicsLongRetrieval,
|
|
24
|
+
BrightEconomicsRetrieval,
|
|
25
|
+
BrightLeetcodeRetrieval,
|
|
26
|
+
BrightPonyLongRetrieval,
|
|
27
|
+
BrightPonyRetrieval,
|
|
28
|
+
BrightPsychologyLongRetrieval,
|
|
29
|
+
BrightPsychologyRetrieval,
|
|
30
|
+
BrightRoboticsLongRetrieval,
|
|
31
|
+
BrightRoboticsRetrieval,
|
|
32
|
+
BrightStackoverflowLongRetrieval,
|
|
33
|
+
BrightStackoverflowRetrieval,
|
|
34
|
+
BrightSustainableLivingLongRetrieval,
|
|
35
|
+
BrightSustainableLivingRetrieval,
|
|
36
|
+
BrightTheoremQAQuestionsRetrieval,
|
|
37
|
+
BrightTheoremQATheoremsRetrieval,
|
|
38
|
+
)
|
|
17
39
|
from .built_bench_retrieval import BuiltBenchRetrieval
|
|
18
40
|
from .chat_doctor_retrieval import ChatDoctorRetrieval
|
|
19
41
|
from .chem_hotpot_qa_retrieval import ChemHotpotQARetrieval
|
|
@@ -236,8 +258,28 @@ __all__ = [
|
|
|
236
258
|
"BarExamQARetrieval",
|
|
237
259
|
"BillSumCARetrieval",
|
|
238
260
|
"BillSumUSRetrieval",
|
|
261
|
+
"BrightAopsRetrieval",
|
|
262
|
+
"BrightBiologyLongRetrieval",
|
|
263
|
+
"BrightBiologyRetrieval",
|
|
264
|
+
"BrightEarthScienceLongRetrieval",
|
|
265
|
+
"BrightEarthScienceRetrieval",
|
|
266
|
+
"BrightEconomicsLongRetrieval",
|
|
267
|
+
"BrightEconomicsRetrieval",
|
|
268
|
+
"BrightLeetcodeRetrieval",
|
|
239
269
|
"BrightLongRetrieval",
|
|
270
|
+
"BrightPonyLongRetrieval",
|
|
271
|
+
"BrightPonyRetrieval",
|
|
272
|
+
"BrightPsychologyLongRetrieval",
|
|
273
|
+
"BrightPsychologyRetrieval",
|
|
240
274
|
"BrightRetrieval",
|
|
275
|
+
"BrightRoboticsLongRetrieval",
|
|
276
|
+
"BrightRoboticsRetrieval",
|
|
277
|
+
"BrightStackoverflowLongRetrieval",
|
|
278
|
+
"BrightStackoverflowRetrieval",
|
|
279
|
+
"BrightSustainableLivingLongRetrieval",
|
|
280
|
+
"BrightSustainableLivingRetrieval",
|
|
281
|
+
"BrightTheoremQAQuestionsRetrieval",
|
|
282
|
+
"BrightTheoremQATheoremsRetrieval",
|
|
241
283
|
"BuiltBenchRetrieval",
|
|
242
284
|
"CIRRIT2IRetrieval",
|
|
243
285
|
"CQADupstackAndroidRetrieval",
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import warnings
|
|
1
2
|
from collections import defaultdict
|
|
2
3
|
|
|
3
4
|
import datasets
|
|
@@ -86,6 +87,12 @@ def load_data(self) -> None:
|
|
|
86
87
|
if self.data_loaded:
|
|
87
88
|
return
|
|
88
89
|
|
|
90
|
+
warnings.warn(
|
|
91
|
+
"This task contains wrong prompts in the metadata. "
|
|
92
|
+
"Please use BRIGHT(v1.1) benchmark instead.",
|
|
93
|
+
category=DeprecationWarning,
|
|
94
|
+
)
|
|
95
|
+
|
|
89
96
|
self.corpus, self.queries, self.relevant_docs = self.load_bright_data(
|
|
90
97
|
path=self.metadata.dataset["path"],
|
|
91
98
|
domains=list(self.metadata.eval_langs.keys()),
|
|
@@ -104,7 +111,7 @@ class BrightRetrieval(AbsTaskRetrieval):
|
|
|
104
111
|
"revision": "a75a0eb483f6a5233a6efc2d63d71540a4443dfb",
|
|
105
112
|
},
|
|
106
113
|
reference="https://huggingface.co/datasets/xlangai/BRIGHT",
|
|
107
|
-
description="
|
|
114
|
+
description="BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval",
|
|
108
115
|
type="Retrieval",
|
|
109
116
|
category="t2t",
|
|
110
117
|
eval_splits=["standard"],
|
|
@@ -129,6 +136,7 @@ class BrightRetrieval(AbsTaskRetrieval):
|
|
|
129
136
|
year = {2024},
|
|
130
137
|
}
|
|
131
138
|
""",
|
|
139
|
+
superseded_by="BrightBiologyRetrieval",
|
|
132
140
|
)
|
|
133
141
|
load_bright_data = load_bright_data
|
|
134
142
|
load_data = load_data
|