mteb 2.7.2__py3-none-any.whl → 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +16 -9
- mteb/_evaluators/any_sts_evaluator.py +10 -5
- mteb/_evaluators/clustering_evaluator.py +10 -4
- mteb/_evaluators/evaluator.py +9 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
- mteb/_evaluators/pair_classification_evaluator.py +10 -5
- mteb/_evaluators/retrieval_evaluator.py +19 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +14 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
- mteb/_evaluators/text/summarization_evaluator.py +8 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +17 -9
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +15 -6
- mteb/abstasks/clustering.py +17 -8
- mteb/abstasks/clustering_legacy.py +14 -6
- mteb/abstasks/image/image_text_pair_classification.py +17 -7
- mteb/abstasks/multilabel_classification.py +11 -5
- mteb/abstasks/pair_classification.py +19 -9
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +27 -16
- mteb/abstasks/retrieval_dataset_loaders.py +11 -8
- mteb/abstasks/sts.py +19 -10
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +14 -7
- mteb/abstasks/text/summarization.py +17 -7
- mteb/abstasks/zeroshot_classification.py +15 -7
- mteb/benchmarks/_create_table.py +13 -3
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/evaluate.py +20 -18
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +7 -3
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +11 -4
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +9 -4
- mteb/models/model_implementations/bedrock_models.py +16 -6
- mteb/models/model_implementations/blip2_models.py +9 -4
- mteb/models/model_implementations/blip_models.py +9 -4
- mteb/models/model_implementations/bm25.py +15 -10
- mteb/models/model_implementations/bmretriever_models.py +6 -2
- mteb/models/model_implementations/cde_models.py +9 -5
- mteb/models/model_implementations/clip_models.py +9 -4
- mteb/models/model_implementations/cohere_models.py +10 -4
- mteb/models/model_implementations/cohere_v.py +9 -4
- mteb/models/model_implementations/colpali_models.py +4 -3
- mteb/models/model_implementations/colqwen_models.py +10 -31
- mteb/models/model_implementations/colsmol_models.py +1 -1
- mteb/models/model_implementations/conan_models.py +10 -4
- mteb/models/model_implementations/dino_models.py +9 -4
- mteb/models/model_implementations/e5_v.py +9 -4
- mteb/models/model_implementations/eagerworks_models.py +10 -4
- mteb/models/model_implementations/evaclip_models.py +9 -4
- mteb/models/model_implementations/gme_v_models.py +5 -3
- mteb/models/model_implementations/google_models.py +10 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
- mteb/models/model_implementations/hinvec_models.py +5 -1
- mteb/models/model_implementations/jasper_models.py +12 -5
- mteb/models/model_implementations/jina_clip.py +9 -4
- mteb/models/model_implementations/jina_models.py +10 -5
- mteb/models/model_implementations/kalm_models.py +18 -12
- mteb/models/model_implementations/linq_models.py +6 -1
- mteb/models/model_implementations/listconranker.py +9 -4
- mteb/models/model_implementations/llm2clip_models.py +9 -4
- mteb/models/model_implementations/llm2vec_models.py +12 -6
- mteb/models/model_implementations/mcinext_models.py +5 -2
- mteb/models/model_implementations/moco_models.py +9 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +10 -4
- mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
- mteb/models/model_implementations/nomic_models.py +10 -4
- mteb/models/model_implementations/nomic_models_vision.py +4 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
- mteb/models/model_implementations/nvidia_models.py +12 -4
- mteb/models/model_implementations/octen_models.py +1 -1
- mteb/models/model_implementations/openai_models.py +9 -4
- mteb/models/model_implementations/openclip_models.py +9 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
- mteb/models/model_implementations/ops_moa_models.py +7 -2
- mteb/models/model_implementations/promptriever_models.py +12 -6
- mteb/models/model_implementations/pylate_models.py +19 -13
- mteb/models/model_implementations/qwen3_models.py +8 -1
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/repllama_models.py +13 -6
- mteb/models/model_implementations/rerankers_custom.py +10 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
- mteb/models/model_implementations/salesforce_models.py +7 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
- mteb/models/model_implementations/seed_models.py +1 -1
- mteb/models/model_implementations/siglip_models.py +9 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/uae_models.py +9 -4
- mteb/models/model_implementations/vdr_models.py +7 -1
- mteb/models/model_implementations/vista_models.py +9 -4
- mteb/models/model_implementations/vlm2vec_models.py +9 -4
- mteb/models/model_implementations/voyage_models.py +10 -4
- mteb/models/model_implementations/voyage_v.py +10 -6
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +12 -7
- mteb/models/models_protocols.py +19 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +19 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +15 -9
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/clustering/nob/snl_clustering.py +7 -2
- mteb/tasks/clustering/nob/vg_clustering.py +7 -2
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/RECORD +151 -151
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
mteb/abstasks/clustering.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import itertools
|
|
2
4
|
import logging
|
|
3
5
|
import random
|
|
4
6
|
from collections import defaultdict
|
|
5
|
-
from
|
|
6
|
-
from typing import Any, cast
|
|
7
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
7
8
|
|
|
8
9
|
import numpy as np
|
|
9
10
|
from datasets import Dataset, DatasetDict
|
|
@@ -11,13 +12,10 @@ from sklearn.cluster import MiniBatchKMeans
|
|
|
11
12
|
from sklearn.metrics.cluster import v_measure_score
|
|
12
13
|
|
|
13
14
|
from mteb._create_dataloaders import create_dataloader
|
|
14
|
-
from mteb.models import EncoderProtocol
|
|
15
|
-
from mteb.types import Array,
|
|
15
|
+
from mteb.models import EncoderProtocol
|
|
16
|
+
from mteb.types import Array, HFSubset
|
|
16
17
|
from mteb.types.statistics import (
|
|
17
|
-
ImageStatistics,
|
|
18
|
-
LabelStatistics,
|
|
19
18
|
SplitDescriptiveStatistics,
|
|
20
|
-
TextStatistics,
|
|
21
19
|
)
|
|
22
20
|
|
|
23
21
|
from ._statistics_calculation import (
|
|
@@ -27,6 +25,17 @@ from ._statistics_calculation import (
|
|
|
27
25
|
)
|
|
28
26
|
from .abstask import AbsTask
|
|
29
27
|
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
from mteb.models import MTEBModels
|
|
32
|
+
from mteb.types import Array, EncodeKwargs, ScoresDict
|
|
33
|
+
from mteb.types.statistics import (
|
|
34
|
+
ImageStatistics,
|
|
35
|
+
LabelStatistics,
|
|
36
|
+
TextStatistics,
|
|
37
|
+
)
|
|
38
|
+
|
|
30
39
|
logger = logging.getLogger(__name__)
|
|
31
40
|
|
|
32
41
|
|
|
@@ -186,7 +195,7 @@ class AbsTaskClustering(AbsTask):
|
|
|
186
195
|
self.max_fraction_of_documents_to_embed * len(data_split)
|
|
187
196
|
)
|
|
188
197
|
else:
|
|
189
|
-
max_documents_to_embed = cast(int, self.max_document_to_embed)
|
|
198
|
+
max_documents_to_embed = cast("int", self.max_document_to_embed)
|
|
190
199
|
|
|
191
200
|
max_documents_to_embed = min(len(data_split), max_documents_to_embed)
|
|
192
201
|
example_indices = self.rng_state.sample(
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any, TypedDict
|
|
4
|
+
from typing import TYPE_CHECKING, Any, TypedDict
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
from datasets import Dataset
|
|
@@ -9,12 +10,8 @@ from sklearn import metrics
|
|
|
9
10
|
|
|
10
11
|
from mteb._evaluators import ClusteringEvaluator
|
|
11
12
|
from mteb.models import EncoderProtocol, MTEBModels
|
|
12
|
-
from mteb.types import EncodeKwargs, ScoresDict
|
|
13
13
|
from mteb.types.statistics import (
|
|
14
|
-
ImageStatistics,
|
|
15
|
-
LabelStatistics,
|
|
16
14
|
SplitDescriptiveStatistics,
|
|
17
|
-
TextStatistics,
|
|
18
15
|
)
|
|
19
16
|
|
|
20
17
|
from ._statistics_calculation import (
|
|
@@ -24,6 +21,17 @@ from ._statistics_calculation import (
|
|
|
24
21
|
)
|
|
25
22
|
from .abstask import AbsTask
|
|
26
23
|
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from mteb.models import MTEBModels
|
|
28
|
+
from mteb.types import EncodeKwargs, ScoresDict
|
|
29
|
+
from mteb.types.statistics import (
|
|
30
|
+
ImageStatistics,
|
|
31
|
+
LabelStatistics,
|
|
32
|
+
TextStatistics,
|
|
33
|
+
)
|
|
34
|
+
|
|
27
35
|
logger = logging.getLogger(__name__)
|
|
28
36
|
|
|
29
37
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from collections.abc import Sequence
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, TypedDict
|
|
5
|
+
from typing import TYPE_CHECKING, Any, TypedDict
|
|
5
6
|
|
|
6
7
|
import torch
|
|
7
|
-
from datasets import
|
|
8
|
+
from datasets import concatenate_datasets
|
|
8
9
|
|
|
9
10
|
from mteb._evaluators import ImageTextPairClassificationEvaluator
|
|
10
11
|
from mteb.abstasks._statistics_calculation import (
|
|
@@ -12,14 +13,23 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
12
13
|
calculate_text_statistics,
|
|
13
14
|
)
|
|
14
15
|
from mteb.abstasks.abstask import AbsTask
|
|
15
|
-
from mteb.models.models_protocols import EncoderProtocol
|
|
16
|
-
from mteb.types import EncodeKwargs
|
|
16
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
17
17
|
from mteb.types.statistics import (
|
|
18
|
-
ImageStatistics,
|
|
19
18
|
SplitDescriptiveStatistics,
|
|
20
|
-
TextStatistics,
|
|
21
19
|
)
|
|
22
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from datasets import Dataset
|
|
25
|
+
|
|
26
|
+
from mteb.models.models_protocols import MTEBModels
|
|
27
|
+
from mteb.types import EncodeKwargs
|
|
28
|
+
from mteb.types.statistics import (
|
|
29
|
+
ImageStatistics,
|
|
30
|
+
TextStatistics,
|
|
31
|
+
)
|
|
32
|
+
|
|
23
33
|
logger = logging.getLogger(__name__)
|
|
24
34
|
|
|
25
35
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import itertools
|
|
2
4
|
import logging
|
|
3
5
|
from collections import defaultdict
|
|
4
|
-
from
|
|
5
|
-
from typing import Any, TypedDict
|
|
6
|
+
from typing import TYPE_CHECKING, Any, TypedDict
|
|
6
7
|
|
|
7
8
|
import numpy as np
|
|
8
9
|
from datasets import DatasetDict
|
|
@@ -15,12 +16,17 @@ from typing_extensions import override
|
|
|
15
16
|
|
|
16
17
|
from mteb._create_dataloaders import create_dataloader
|
|
17
18
|
from mteb._evaluators.classification_metrics import hamming_score
|
|
18
|
-
from mteb.
|
|
19
|
-
from mteb.models import EncoderProtocol, MTEBModels
|
|
20
|
-
from mteb.types import Array, EncodeKwargs
|
|
19
|
+
from mteb.models import EncoderProtocol
|
|
21
20
|
|
|
22
21
|
from .classification import AbsTaskClassification
|
|
23
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
|
|
27
|
+
from mteb.models import MTEBModels
|
|
28
|
+
from mteb.types import Array, EncodeKwargs
|
|
29
|
+
|
|
24
30
|
logger = logging.getLogger(__name__)
|
|
25
31
|
|
|
26
32
|
|
|
@@ -1,16 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
4
|
import logging
|
|
3
5
|
from collections import defaultdict
|
|
4
|
-
from
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
5
7
|
|
|
6
8
|
import numpy as np
|
|
7
9
|
from datasets import Dataset
|
|
8
10
|
from sklearn.metrics import average_precision_score
|
|
9
11
|
|
|
10
12
|
from mteb._evaluators import PairClassificationEvaluator
|
|
11
|
-
from mteb._evaluators.pair_classification_evaluator import (
|
|
12
|
-
PairClassificationDistances,
|
|
13
|
-
)
|
|
14
13
|
from mteb.abstasks._statistics_calculation import (
|
|
15
14
|
calculate_image_statistics,
|
|
16
15
|
calculate_label_statistics,
|
|
@@ -18,15 +17,26 @@ from mteb.abstasks._statistics_calculation import (
|
|
|
18
17
|
)
|
|
19
18
|
from mteb.abstasks.abstask import AbsTask
|
|
20
19
|
from mteb.models.model_meta import ScoringFunction
|
|
21
|
-
from mteb.models.models_protocols import EncoderProtocol
|
|
22
|
-
from mteb.types import EncodeKwargs, PromptType
|
|
20
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
23
21
|
from mteb.types.statistics import (
|
|
24
|
-
ImageStatistics,
|
|
25
|
-
LabelStatistics,
|
|
26
22
|
SplitDescriptiveStatistics,
|
|
27
|
-
TextStatistics,
|
|
28
23
|
)
|
|
29
24
|
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
from mteb._evaluators.pair_classification_evaluator import (
|
|
29
|
+
PairClassificationDistances,
|
|
30
|
+
)
|
|
31
|
+
from mteb.models.models_protocols import MTEBModels
|
|
32
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
33
|
+
from mteb.types.statistics import (
|
|
34
|
+
ImageStatistics,
|
|
35
|
+
LabelStatistics,
|
|
36
|
+
TextStatistics,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
30
40
|
logger = logging.getLogger(__name__)
|
|
31
41
|
|
|
32
42
|
|
mteb/abstasks/regression.py
CHANGED
|
@@ -1,29 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import TypedDict
|
|
4
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
3
5
|
|
|
4
6
|
import datasets
|
|
5
7
|
import numpy as np
|
|
6
8
|
import pandas as pd
|
|
7
|
-
from datasets import Dataset
|
|
8
9
|
from scipy.stats import kendalltau
|
|
9
10
|
from sklearn.linear_model import LinearRegression
|
|
10
11
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
11
12
|
|
|
12
|
-
from mteb._evaluators.sklearn_evaluator import SklearnEvaluator
|
|
13
|
+
from mteb._evaluators.sklearn_evaluator import SklearnEvaluator
|
|
13
14
|
from mteb.abstasks._statistics_calculation import (
|
|
14
15
|
calculate_image_statistics,
|
|
15
16
|
calculate_score_statistics,
|
|
16
17
|
calculate_text_statistics,
|
|
17
18
|
)
|
|
18
19
|
from mteb.types.statistics import (
|
|
19
|
-
ImageStatistics,
|
|
20
|
-
ScoreStatistics,
|
|
21
20
|
SplitDescriptiveStatistics,
|
|
22
|
-
TextStatistics,
|
|
23
21
|
)
|
|
24
22
|
|
|
25
23
|
from .classification import AbsTaskClassification
|
|
26
24
|
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from datasets import Dataset
|
|
27
|
+
|
|
28
|
+
from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
|
|
29
|
+
from mteb.types.statistics import (
|
|
30
|
+
ImageStatistics,
|
|
31
|
+
ScoreStatistics,
|
|
32
|
+
TextStatistics,
|
|
33
|
+
)
|
|
34
|
+
|
|
27
35
|
logger = logging.getLogger(__name__)
|
|
28
36
|
|
|
29
37
|
|
mteb/abstasks/retrieval.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
from collections import defaultdict
|
|
4
|
-
from collections.abc import Callable, Mapping, Sequence
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from time import time
|
|
7
|
-
from typing import Any, Literal
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
9
|
|
|
9
10
|
from datasets import Dataset, DatasetDict, concatenate_datasets
|
|
10
|
-
from typing_extensions import Self
|
|
11
11
|
|
|
12
12
|
from mteb._create_dataloaders import (
|
|
13
13
|
_combine_queries_with_instruction_text,
|
|
@@ -19,25 +19,12 @@ from mteb._evaluators.retrieval_metrics import make_score_dict
|
|
|
19
19
|
from mteb.models import (
|
|
20
20
|
CrossEncoderProtocol,
|
|
21
21
|
EncoderProtocol,
|
|
22
|
-
MTEBModels,
|
|
23
22
|
SearchCrossEncoderWrapper,
|
|
24
23
|
SearchEncoderWrapper,
|
|
25
24
|
SearchProtocol,
|
|
26
25
|
)
|
|
27
|
-
from mteb.types import (
|
|
28
|
-
EncodeKwargs,
|
|
29
|
-
HFSubset,
|
|
30
|
-
QueryDatasetType,
|
|
31
|
-
RelevantDocumentsType,
|
|
32
|
-
RetrievalOutputType,
|
|
33
|
-
ScoresDict,
|
|
34
|
-
)
|
|
35
26
|
from mteb.types.statistics import (
|
|
36
|
-
ImageStatistics,
|
|
37
|
-
RelevantDocsStatistics,
|
|
38
27
|
SplitDescriptiveStatistics,
|
|
39
|
-
TextStatistics,
|
|
40
|
-
TopRankedStatistics,
|
|
41
28
|
)
|
|
42
29
|
|
|
43
30
|
from ._statistics_calculation import (
|
|
@@ -53,6 +40,30 @@ from .retrieval_dataset_loaders import (
|
|
|
53
40
|
_combine_queries_with_instructions_datasets,
|
|
54
41
|
)
|
|
55
42
|
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
45
|
+
|
|
46
|
+
from typing_extensions import Self
|
|
47
|
+
|
|
48
|
+
from mteb.models import (
|
|
49
|
+
MTEBModels,
|
|
50
|
+
)
|
|
51
|
+
from mteb.types import (
|
|
52
|
+
EncodeKwargs,
|
|
53
|
+
HFSubset,
|
|
54
|
+
QueryDatasetType,
|
|
55
|
+
RelevantDocumentsType,
|
|
56
|
+
RetrievalOutputType,
|
|
57
|
+
ScoresDict,
|
|
58
|
+
)
|
|
59
|
+
from mteb.types.statistics import (
|
|
60
|
+
ImageStatistics,
|
|
61
|
+
RelevantDocsStatistics,
|
|
62
|
+
TextStatistics,
|
|
63
|
+
TopRankedStatistics,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
|
|
56
67
|
logger = logging.getLogger(__name__)
|
|
57
68
|
|
|
58
69
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import TypedDict
|
|
4
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
3
5
|
|
|
4
6
|
from datasets import (
|
|
5
7
|
Dataset,
|
|
@@ -11,13 +13,14 @@ from datasets import (
|
|
|
11
13
|
load_dataset,
|
|
12
14
|
)
|
|
13
15
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from mteb.types import (
|
|
18
|
+
CorpusDatasetType,
|
|
19
|
+
InstructionDatasetType,
|
|
20
|
+
QueryDatasetType,
|
|
21
|
+
RelevantDocumentsType,
|
|
22
|
+
TopRankedDocumentsType,
|
|
23
|
+
)
|
|
21
24
|
|
|
22
25
|
logger = logging.getLogger(__name__)
|
|
23
26
|
|
mteb/abstasks/sts.py
CHANGED
|
@@ -1,19 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any, TypedDict, cast
|
|
4
|
+
from typing import TYPE_CHECKING, Any, TypedDict, cast
|
|
4
5
|
|
|
5
|
-
from datasets import Dataset
|
|
6
6
|
from scipy.stats import pearsonr, spearmanr
|
|
7
7
|
|
|
8
8
|
from mteb._evaluators import AnySTSEvaluator
|
|
9
|
-
from mteb.
|
|
10
|
-
from mteb.models import EncoderProtocol, MTEBModels
|
|
11
|
-
from mteb.types import EncodeKwargs, PromptType
|
|
9
|
+
from mteb.models import EncoderProtocol
|
|
12
10
|
from mteb.types.statistics import (
|
|
13
|
-
ImageStatistics,
|
|
14
|
-
ScoreStatistics,
|
|
15
11
|
SplitDescriptiveStatistics,
|
|
16
|
-
TextStatistics,
|
|
17
12
|
)
|
|
18
13
|
|
|
19
14
|
from ._statistics_calculation import (
|
|
@@ -23,6 +18,20 @@ from ._statistics_calculation import (
|
|
|
23
18
|
)
|
|
24
19
|
from .abstask import AbsTask
|
|
25
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from datasets import Dataset
|
|
25
|
+
|
|
26
|
+
from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
|
|
27
|
+
from mteb.models import MTEBModels
|
|
28
|
+
from mteb.types import EncodeKwargs, PromptType
|
|
29
|
+
from mteb.types.statistics import (
|
|
30
|
+
ImageStatistics,
|
|
31
|
+
ScoreStatistics,
|
|
32
|
+
TextStatistics,
|
|
33
|
+
)
|
|
34
|
+
|
|
26
35
|
logger = logging.getLogger(__name__)
|
|
27
36
|
|
|
28
37
|
|
|
@@ -182,7 +191,7 @@ class AbsTaskSTS(AbsTask):
|
|
|
182
191
|
self, split: str, hf_subset: str | None = None, compute_overall: bool = False
|
|
183
192
|
) -> AnySTSDescriptiveStatistics:
|
|
184
193
|
first_column, second_column = self.column_names
|
|
185
|
-
self.dataset = cast(dict[str, dict[str, Dataset]], self.dataset)
|
|
194
|
+
self.dataset = cast("dict[str, dict[str, Dataset]]", self.dataset)
|
|
186
195
|
|
|
187
196
|
if hf_subset:
|
|
188
197
|
sentence1 = self.dataset[hf_subset][split][first_column]
|
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
from collections.abc import Sequence
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Any, Literal, cast
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
8
|
|
|
7
9
|
from huggingface_hub import (
|
|
8
|
-
CardData,
|
|
9
10
|
DatasetCard,
|
|
10
11
|
DatasetCardData,
|
|
11
12
|
constants,
|
|
@@ -17,13 +18,11 @@ from pydantic import (
|
|
|
17
18
|
ConfigDict,
|
|
18
19
|
field_validator,
|
|
19
20
|
)
|
|
20
|
-
from typing_extensions import Required, TypedDict
|
|
21
|
+
from typing_extensions import Required, TypedDict # noqa: TC002
|
|
21
22
|
|
|
22
23
|
import mteb
|
|
23
24
|
from mteb.languages import check_language_code
|
|
24
25
|
from mteb.types import (
|
|
25
|
-
HFSubset,
|
|
26
|
-
ISOLanguageScript,
|
|
27
26
|
Languages,
|
|
28
27
|
Licenses,
|
|
29
28
|
Modalities,
|
|
@@ -31,7 +30,17 @@ from mteb.types import (
|
|
|
31
30
|
StrDate,
|
|
32
31
|
StrURL,
|
|
33
32
|
)
|
|
34
|
-
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from huggingface_hub import (
|
|
36
|
+
CardData,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
from mteb.types import (
|
|
40
|
+
HFSubset,
|
|
41
|
+
ISOLanguageScript,
|
|
42
|
+
)
|
|
43
|
+
from mteb.types.statistics import DescriptiveStatistics
|
|
35
44
|
|
|
36
45
|
logger = logging.getLogger(__name__)
|
|
37
46
|
|
|
@@ -368,7 +377,7 @@ class TaskMetadata(BaseModel):
|
|
|
368
377
|
"""Return a dictionary mapping huggingface subsets to languages."""
|
|
369
378
|
if isinstance(self.eval_langs, dict):
|
|
370
379
|
return self.eval_langs
|
|
371
|
-
return {"default": cast(list[str], self.eval_langs)}
|
|
380
|
+
return {"default": cast("list[str]", self.eval_langs)}
|
|
372
381
|
|
|
373
382
|
@property
|
|
374
383
|
def intext_citation(self, include_cite: bool = True) -> str:
|
|
@@ -697,7 +706,7 @@ class TaskMetadata(BaseModel):
|
|
|
697
706
|
for val in self.eval_langs.values():
|
|
698
707
|
languages.extend(val)
|
|
699
708
|
else:
|
|
700
|
-
languages = cast(list[str], self.eval_langs)
|
|
709
|
+
languages = cast("list[str]", self.eval_langs)
|
|
701
710
|
# value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
|
|
702
711
|
# or a special value like "code", "multilingual".
|
|
703
712
|
readme_langs = []
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from collections import defaultdict
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, ClassVar, TypedDict, cast
|
|
5
|
+
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast
|
|
5
6
|
|
|
6
7
|
from datasets import Dataset, DatasetDict
|
|
7
8
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
|
@@ -9,9 +10,15 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc
|
|
|
9
10
|
from mteb._evaluators import BitextMiningEvaluator
|
|
10
11
|
from mteb.abstasks._statistics_calculation import calculate_text_statistics
|
|
11
12
|
from mteb.abstasks.abstask import AbsTask
|
|
12
|
-
from mteb.models import EncoderProtocol
|
|
13
|
-
from mteb.types import
|
|
14
|
-
|
|
13
|
+
from mteb.models import EncoderProtocol
|
|
14
|
+
from mteb.types.statistics import SplitDescriptiveStatistics
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from mteb.models import MTEBModels
|
|
20
|
+
from mteb.types import EncodeKwargs, HFSubset, ScoresDict
|
|
21
|
+
from mteb.types.statistics import TextStatistics
|
|
15
22
|
|
|
16
23
|
logger = logging.getLogger(__name__)
|
|
17
24
|
|
|
@@ -90,7 +97,7 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
90
97
|
if subsets_to_run is not None:
|
|
91
98
|
hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
|
|
92
99
|
|
|
93
|
-
encoder_model = cast(EncoderProtocol, model)
|
|
100
|
+
encoder_model = cast("EncoderProtocol", model)
|
|
94
101
|
|
|
95
102
|
if self.dataset is None:
|
|
96
103
|
raise ValueError("Dataset is not loaded.")
|
|
@@ -127,7 +134,7 @@ class AbsTaskBitextMining(AbsTask):
|
|
|
127
134
|
**kwargs,
|
|
128
135
|
)
|
|
129
136
|
|
|
130
|
-
return cast(dict[HFSubset, ScoresDict], scores)
|
|
137
|
+
return cast("dict[HFSubset, ScoresDict]", scores)
|
|
131
138
|
|
|
132
139
|
def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
|
|
133
140
|
pairs = self._DEFAULT_PAIR
|
|
@@ -1,24 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
3
5
|
|
|
4
6
|
import numpy as np
|
|
5
|
-
from datasets import Dataset
|
|
6
7
|
|
|
7
8
|
from mteb._evaluators import SummarizationEvaluator
|
|
8
|
-
from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
|
|
9
9
|
from mteb.abstasks._statistics_calculation import (
|
|
10
10
|
calculate_score_statistics,
|
|
11
11
|
calculate_text_statistics,
|
|
12
12
|
)
|
|
13
13
|
from mteb.abstasks.abstask import AbsTask
|
|
14
|
-
from mteb.models import EncoderProtocol
|
|
15
|
-
from mteb.types import EncodeKwargs
|
|
14
|
+
from mteb.models import EncoderProtocol
|
|
16
15
|
from mteb.types.statistics import (
|
|
17
|
-
ScoreStatistics,
|
|
18
16
|
SplitDescriptiveStatistics,
|
|
19
|
-
TextStatistics,
|
|
20
17
|
)
|
|
21
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from datasets import Dataset
|
|
23
|
+
|
|
24
|
+
from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
|
|
25
|
+
from mteb.models import MTEBModels
|
|
26
|
+
from mteb.types import EncodeKwargs
|
|
27
|
+
from mteb.types.statistics import (
|
|
28
|
+
ScoreStatistics,
|
|
29
|
+
TextStatistics,
|
|
30
|
+
)
|
|
31
|
+
|
|
22
32
|
logger = logging.getLogger(__name__)
|
|
23
33
|
|
|
24
34
|
|
|
@@ -1,19 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import TypedDict
|
|
4
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
4
5
|
|
|
5
6
|
import torch
|
|
6
7
|
from datasets import Dataset
|
|
7
8
|
from sklearn import metrics
|
|
8
9
|
|
|
9
10
|
from mteb._evaluators import ZeroShotClassificationEvaluator
|
|
10
|
-
from mteb.models import EncoderProtocol
|
|
11
|
-
from mteb.types import EncodeKwargs
|
|
11
|
+
from mteb.models import EncoderProtocol
|
|
12
12
|
from mteb.types.statistics import (
|
|
13
|
-
ImageStatistics,
|
|
14
|
-
LabelStatistics,
|
|
15
13
|
SplitDescriptiveStatistics,
|
|
16
|
-
TextStatistics,
|
|
17
14
|
)
|
|
18
15
|
|
|
19
16
|
from ._statistics_calculation import (
|
|
@@ -23,6 +20,17 @@ from ._statistics_calculation import (
|
|
|
23
20
|
)
|
|
24
21
|
from .abstask import AbsTask
|
|
25
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
from mteb.models import MTEBModels
|
|
27
|
+
from mteb.types import EncodeKwargs
|
|
28
|
+
from mteb.types.statistics import (
|
|
29
|
+
ImageStatistics,
|
|
30
|
+
LabelStatistics,
|
|
31
|
+
TextStatistics,
|
|
32
|
+
)
|
|
33
|
+
|
|
26
34
|
logger = logging.getLogger(__name__)
|
|
27
35
|
|
|
28
36
|
|
mteb/benchmarks/_create_table.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import re
|
|
2
4
|
from collections import defaultdict
|
|
3
|
-
from typing import Literal
|
|
5
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import pandas as pd
|
|
7
9
|
|
|
8
10
|
import mteb
|
|
9
11
|
from mteb.get_tasks import get_task, get_tasks
|
|
10
|
-
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from mteb.results.benchmark_results import BenchmarkResults
|
|
11
15
|
|
|
12
16
|
|
|
13
17
|
def _borda_count(scores: pd.Series) -> pd.Series:
|
|
@@ -303,6 +307,7 @@ def _create_per_language_table_from_benchmark_results(
|
|
|
303
307
|
|
|
304
308
|
def _create_summary_table_mean_public_private(
|
|
305
309
|
benchmark_results: BenchmarkResults,
|
|
310
|
+
exclude_private_from_borda: bool = False,
|
|
306
311
|
) -> pd.DataFrame:
|
|
307
312
|
"""Create summary table from BenchmarkResults.
|
|
308
313
|
|
|
@@ -311,6 +316,7 @@ def _create_summary_table_mean_public_private(
|
|
|
311
316
|
|
|
312
317
|
Args:
|
|
313
318
|
benchmark_results: BenchmarkResults object containing model results
|
|
319
|
+
exclude_private_from_borda: If True, calculate Borda rank using only public tasks
|
|
314
320
|
|
|
315
321
|
Returns:
|
|
316
322
|
DataFrame with model summaries, ready for styling in the leaderboard
|
|
@@ -356,7 +362,11 @@ def _create_summary_table_mean_public_private(
|
|
|
356
362
|
joint_table = joint_table.drop(models_to_remove, axis=0)
|
|
357
363
|
joint_table.insert(0, "mean(public)", public_mean)
|
|
358
364
|
joint_table.insert(1, "mean(private)", private_mean)
|
|
359
|
-
|
|
365
|
+
if exclude_private_from_borda:
|
|
366
|
+
borda_per_task = per_task[public_task_name]
|
|
367
|
+
else:
|
|
368
|
+
borda_per_task = per_task
|
|
369
|
+
joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
|
|
360
370
|
joint_table = joint_table.sort_values("borda_rank", ascending=True)
|
|
361
371
|
joint_table = joint_table.reset_index()
|
|
362
372
|
|
mteb/benchmarks/benchmark.py
CHANGED
|
@@ -123,9 +123,19 @@ class RtebBenchmark(Benchmark):
|
|
|
123
123
|
_create_summary_table_mean_public_private,
|
|
124
124
|
)
|
|
125
125
|
|
|
126
|
-
joint_table = _create_summary_table_mean_public_private(
|
|
126
|
+
joint_table = _create_summary_table_mean_public_private(
|
|
127
|
+
benchmark_results, exclude_private_from_borda=True
|
|
128
|
+
)
|
|
129
|
+
# issue 3902: temporarily remove the private column from the RTEB summary table
|
|
130
|
+
if "Mean (Private)" in joint_table.columns:
|
|
131
|
+
joint_table = joint_table.drop(columns=["Mean (Private)"])
|
|
127
132
|
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
|
|
133
|
+
# However, due to issue 3902: when the Private column existed, Mean (Task) was the mean of Public and Private, so we instead drop Mean (Task) and rename Mean (Public) to Mean (Task).
|
|
128
134
|
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
|
|
135
|
+
if "Mean (Task)" in joint_table.columns:
|
|
136
|
+
joint_table = joint_table.drop(columns=["Mean (Task)"])
|
|
137
|
+
joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
|
|
138
|
+
|
|
129
139
|
return joint_table
|
|
130
140
|
|
|
131
141
|
|