mteb 2.7.2__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. mteb/_create_dataloaders.py +16 -9
  2. mteb/_evaluators/any_sts_evaluator.py +10 -5
  3. mteb/_evaluators/clustering_evaluator.py +10 -4
  4. mteb/_evaluators/evaluator.py +9 -4
  5. mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
  6. mteb/_evaluators/pair_classification_evaluator.py +10 -5
  7. mteb/_evaluators/retrieval_evaluator.py +19 -13
  8. mteb/_evaluators/retrieval_metrics.py +9 -3
  9. mteb/_evaluators/sklearn_evaluator.py +14 -10
  10. mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
  11. mteb/_evaluators/text/summarization_evaluator.py +8 -4
  12. mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
  13. mteb/_helpful_enum.py +5 -1
  14. mteb/abstasks/_data_filter/filters.py +8 -2
  15. mteb/abstasks/_data_filter/task_pipelines.py +7 -2
  16. mteb/abstasks/_statistics_calculation.py +6 -4
  17. mteb/abstasks/abstask.py +17 -9
  18. mteb/abstasks/aggregate_task_metadata.py +20 -9
  19. mteb/abstasks/aggregated_task.py +15 -8
  20. mteb/abstasks/classification.py +15 -6
  21. mteb/abstasks/clustering.py +17 -8
  22. mteb/abstasks/clustering_legacy.py +14 -6
  23. mteb/abstasks/image/image_text_pair_classification.py +17 -7
  24. mteb/abstasks/multilabel_classification.py +11 -5
  25. mteb/abstasks/pair_classification.py +19 -9
  26. mteb/abstasks/regression.py +14 -6
  27. mteb/abstasks/retrieval.py +27 -16
  28. mteb/abstasks/retrieval_dataset_loaders.py +11 -8
  29. mteb/abstasks/sts.py +19 -10
  30. mteb/abstasks/task_metadata.py +17 -8
  31. mteb/abstasks/text/bitext_mining.py +14 -7
  32. mteb/abstasks/text/summarization.py +17 -7
  33. mteb/abstasks/zeroshot_classification.py +15 -7
  34. mteb/benchmarks/_create_table.py +13 -3
  35. mteb/benchmarks/benchmark.py +11 -1
  36. mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
  37. mteb/cache.py +10 -5
  38. mteb/cli/_display_tasks.py +9 -3
  39. mteb/cli/build_cli.py +5 -2
  40. mteb/cli/generate_model_card.py +9 -2
  41. mteb/deprecated_evaluator.py +16 -12
  42. mteb/evaluate.py +20 -18
  43. mteb/filter_tasks.py +12 -7
  44. mteb/get_tasks.py +9 -4
  45. mteb/languages/language_scripts.py +8 -3
  46. mteb/leaderboard/app.py +7 -3
  47. mteb/leaderboard/table.py +7 -2
  48. mteb/load_results.py +9 -3
  49. mteb/models/abs_encoder.py +22 -12
  50. mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
  51. mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
  52. mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
  53. mteb/models/cache_wrappers/cache_wrapper.py +14 -9
  54. mteb/models/get_model_meta.py +11 -4
  55. mteb/models/instruct_wrapper.py +13 -5
  56. mteb/models/model_implementations/align_models.py +9 -4
  57. mteb/models/model_implementations/bedrock_models.py +16 -6
  58. mteb/models/model_implementations/blip2_models.py +9 -4
  59. mteb/models/model_implementations/blip_models.py +9 -4
  60. mteb/models/model_implementations/bm25.py +15 -10
  61. mteb/models/model_implementations/bmretriever_models.py +6 -2
  62. mteb/models/model_implementations/cde_models.py +9 -5
  63. mteb/models/model_implementations/clip_models.py +9 -4
  64. mteb/models/model_implementations/cohere_models.py +10 -4
  65. mteb/models/model_implementations/cohere_v.py +9 -4
  66. mteb/models/model_implementations/colpali_models.py +4 -3
  67. mteb/models/model_implementations/colqwen_models.py +10 -31
  68. mteb/models/model_implementations/colsmol_models.py +1 -1
  69. mteb/models/model_implementations/conan_models.py +10 -4
  70. mteb/models/model_implementations/dino_models.py +9 -4
  71. mteb/models/model_implementations/e5_v.py +9 -4
  72. mteb/models/model_implementations/eagerworks_models.py +10 -4
  73. mteb/models/model_implementations/evaclip_models.py +9 -4
  74. mteb/models/model_implementations/gme_v_models.py +5 -3
  75. mteb/models/model_implementations/google_models.py +10 -4
  76. mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
  77. mteb/models/model_implementations/hinvec_models.py +5 -1
  78. mteb/models/model_implementations/jasper_models.py +12 -5
  79. mteb/models/model_implementations/jina_clip.py +9 -4
  80. mteb/models/model_implementations/jina_models.py +10 -5
  81. mteb/models/model_implementations/kalm_models.py +18 -12
  82. mteb/models/model_implementations/linq_models.py +6 -1
  83. mteb/models/model_implementations/listconranker.py +9 -4
  84. mteb/models/model_implementations/llm2clip_models.py +9 -4
  85. mteb/models/model_implementations/llm2vec_models.py +12 -6
  86. mteb/models/model_implementations/mcinext_models.py +5 -2
  87. mteb/models/model_implementations/moco_models.py +9 -4
  88. mteb/models/model_implementations/mod_models.py +1 -1
  89. mteb/models/model_implementations/model2vec_models.py +10 -4
  90. mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
  91. mteb/models/model_implementations/nomic_models.py +10 -4
  92. mteb/models/model_implementations/nomic_models_vision.py +4 -3
  93. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
  94. mteb/models/model_implementations/nvidia_models.py +12 -4
  95. mteb/models/model_implementations/octen_models.py +1 -1
  96. mteb/models/model_implementations/openai_models.py +9 -4
  97. mteb/models/model_implementations/openclip_models.py +9 -4
  98. mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
  99. mteb/models/model_implementations/ops_moa_models.py +7 -2
  100. mteb/models/model_implementations/promptriever_models.py +12 -6
  101. mteb/models/model_implementations/pylate_models.py +19 -13
  102. mteb/models/model_implementations/qwen3_models.py +8 -1
  103. mteb/models/model_implementations/random_baseline.py +4 -3
  104. mteb/models/model_implementations/repllama_models.py +13 -6
  105. mteb/models/model_implementations/rerankers_custom.py +10 -4
  106. mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
  107. mteb/models/model_implementations/salesforce_models.py +7 -1
  108. mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
  109. mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
  110. mteb/models/model_implementations/seed_models.py +1 -1
  111. mteb/models/model_implementations/siglip_models.py +9 -4
  112. mteb/models/model_implementations/slm_models.py +7 -4
  113. mteb/models/model_implementations/uae_models.py +9 -4
  114. mteb/models/model_implementations/vdr_models.py +7 -1
  115. mteb/models/model_implementations/vista_models.py +9 -4
  116. mteb/models/model_implementations/vlm2vec_models.py +9 -4
  117. mteb/models/model_implementations/voyage_models.py +10 -4
  118. mteb/models/model_implementations/voyage_v.py +10 -6
  119. mteb/models/model_implementations/yuan_models_en.py +1 -1
  120. mteb/models/model_meta.py +12 -7
  121. mteb/models/models_protocols.py +19 -18
  122. mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
  123. mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
  124. mteb/models/search_wrappers.py +19 -12
  125. mteb/models/sentence_transformer_wrapper.py +4 -3
  126. mteb/models/vllm_wrapper.py +8 -6
  127. mteb/results/benchmark_results.py +22 -17
  128. mteb/results/model_result.py +21 -15
  129. mteb/results/task_result.py +15 -9
  130. mteb/similarity_functions.py +8 -2
  131. mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
  132. mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
  133. mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
  134. mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
  135. mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
  136. mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
  137. mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
  138. mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
  139. mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
  140. mteb/tasks/clustering/nob/snl_clustering.py +7 -2
  141. mteb/tasks/clustering/nob/vg_clustering.py +7 -2
  142. mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
  143. mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
  144. mteb/types/_encoder_io.py +1 -1
  145. mteb/types/statistics.py +9 -2
  146. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
  147. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/RECORD +151 -151
  148. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
  149. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
  150. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
  151. {mteb-2.7.2.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
mteb/abstasks/clustering.py CHANGED
@@ -1,9 +1,10 @@
+from __future__ import annotations
+
 import itertools
 import logging
 import random
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 from datasets import Dataset, DatasetDict
@@ -11,13 +12,10 @@ from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.cluster import v_measure_score
 
 from mteb._create_dataloaders import create_dataloader
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import Array, EncodeKwargs, HFSubset, ScoresDict
+from mteb.models import EncoderProtocol
+from mteb.types import Array, HFSubset
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from ._statistics_calculation import (
@@ -27,6 +25,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import Array, EncodeKwargs, ScoresDict
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
@@ -186,7 +195,7 @@ class AbsTaskClustering(AbsTask):
                 self.max_fraction_of_documents_to_embed * len(data_split)
             )
         else:
-            max_documents_to_embed = cast(int, self.max_document_to_embed)
+            max_documents_to_embed = cast("int", self.max_document_to_embed)
 
         max_documents_to_embed = min(len(data_split), max_documents_to_embed)
         example_indices = self.rng_state.sample(
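
Nearly every file in this release gets the same refactor: `from __future__ import annotations` defers annotation evaluation (PEP 563), typing-only imports move under an `if TYPE_CHECKING:` guard, and `cast()` switches to string type arguments (ruff's TC006 style) so the target type need not exist at runtime. A minimal sketch of the pattern; the `latest_checkpoint` helper is hypothetical and not part of mteb:

from __future__ import annotations

from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    # Evaluated only by static type checkers; skipped at runtime, which
    # trims import time and side-steps circular imports.
    from pathlib import Path


def latest_checkpoint(run_dir: Path) -> Path:
    # With deferred annotations, "Path" above is stored as a string, so
    # the module never needs pathlib at runtime.
    checkpoints = sorted(run_dir.glob("*.ckpt"))
    # String-form cast: also valid when the type exists only for the checker.
    return cast("Path", checkpoints[-1])

Since the guarded imports never execute at runtime, the pattern both speeds up `import mteb` and can help avoid circular-import errors between mteb.models, mteb.types, and the task modules.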
mteb/abstasks/clustering_legacy.py CHANGED
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
 import logging
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import numpy as np
 from datasets import Dataset
@@ -9,12 +10,8 @@ from sklearn import metrics
 
 from mteb._evaluators import ClusteringEvaluator
 from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs, ScoresDict
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from ._statistics_calculation import (
@@ -24,6 +21,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, ScoresDict
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/image/image_text_pair_classification.py CHANGED
@@ -1,10 +1,11 @@
+from __future__ import annotations
+
 import logging
 from collections.abc import Sequence
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import torch
-from datasets import Dataset, concatenate_datasets
+from datasets import concatenate_datasets
 
 from mteb._evaluators import ImageTextPairClassificationEvaluator
 from mteb.abstasks._statistics_calculation import (
@@ -12,14 +13,23 @@ from mteb.abstasks._statistics_calculation import (
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models.models_protocols import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs
+from mteb.models.models_protocols import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb.models.models_protocols import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/multilabel_classification.py CHANGED
@@ -1,8 +1,9 @@
+from __future__ import annotations
+
 import itertools
 import logging
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import numpy as np
 from datasets import DatasetDict
@@ -15,12 +16,17 @@ from typing_extensions import override
 
 from mteb._create_dataloaders import create_dataloader
 from mteb._evaluators.classification_metrics import hamming_score
-from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import Array, EncodeKwargs
+from mteb.models import EncoderProtocol
 
 from .classification import AbsTaskClassification
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
+    from mteb.models import MTEBModels
+    from mteb.types import Array, EncodeKwargs
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/pair_classification.py CHANGED
@@ -1,16 +1,15 @@
+from __future__ import annotations
+
 import hashlib
 import logging
 from collections import defaultdict
-from pathlib import Path
+from typing import TYPE_CHECKING
 
 import numpy as np
 from datasets import Dataset
 from sklearn.metrics import average_precision_score
 
 from mteb._evaluators import PairClassificationEvaluator
-from mteb._evaluators.pair_classification_evaluator import (
-    PairClassificationDistances,
-)
 from mteb.abstasks._statistics_calculation import (
     calculate_image_statistics,
     calculate_label_statistics,
@@ -18,15 +17,26 @@ from mteb.abstasks._statistics_calculation import (
 )
 from mteb.abstasks.abstask import AbsTask
 from mteb.models.model_meta import ScoringFunction
-from mteb.models.models_protocols import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs, PromptType
+from mteb.models.models_protocols import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb._evaluators.pair_classification_evaluator import (
+        PairClassificationDistances,
+    )
+    from mteb.models.models_protocols import MTEBModels
+    from mteb.types import EncodeKwargs, PromptType
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/regression.py CHANGED
@@ -1,29 +1,37 @@
+from __future__ import annotations
+
 import logging
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict
 
 import datasets
 import numpy as np
 import pandas as pd
-from datasets import Dataset
 from scipy.stats import kendalltau
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 
-from mteb._evaluators.sklearn_evaluator import SklearnEvaluator, SklearnModelProtocol
+from mteb._evaluators.sklearn_evaluator import SklearnEvaluator
 from mteb.abstasks._statistics_calculation import (
     calculate_image_statistics,
     calculate_score_statistics,
     calculate_text_statistics,
 )
 from mteb.types.statistics import (
-    ImageStatistics,
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from .classification import AbsTaskClassification
 
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+    from mteb._evaluators.sklearn_evaluator import SklearnModelProtocol
+    from mteb.types.statistics import (
+        ImageStatistics,
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/retrieval.py CHANGED
@@ -1,13 +1,13 @@
+from __future__ import annotations
+
 import json
 import logging
 from collections import defaultdict
-from collections.abc import Callable, Mapping, Sequence
 from pathlib import Path
 from time import time
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 from datasets import Dataset, DatasetDict, concatenate_datasets
-from typing_extensions import Self
 
 from mteb._create_dataloaders import (
     _combine_queries_with_instruction_text,
@@ -19,25 +19,12 @@ from mteb._evaluators.retrieval_metrics import make_score_dict
 from mteb.models import (
     CrossEncoderProtocol,
     EncoderProtocol,
-    MTEBModels,
     SearchCrossEncoderWrapper,
     SearchEncoderWrapper,
     SearchProtocol,
 )
-from mteb.types import (
-    EncodeKwargs,
-    HFSubset,
-    QueryDatasetType,
-    RelevantDocumentsType,
-    RetrievalOutputType,
-    ScoresDict,
-)
 from mteb.types.statistics import (
-    ImageStatistics,
-    RelevantDocsStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
-    TopRankedStatistics,
 )
 
 from ._statistics_calculation import (
@@ -53,6 +40,30 @@ from .retrieval_dataset_loaders import (
     _combine_queries_with_instructions_datasets,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import Callable, Mapping, Sequence
+
+    from typing_extensions import Self
+
+    from mteb.models import (
+        MTEBModels,
+    )
+    from mteb.types import (
+        EncodeKwargs,
+        HFSubset,
+        QueryDatasetType,
+        RelevantDocumentsType,
+        RetrievalOutputType,
+        ScoresDict,
+    )
+    from mteb.types.statistics import (
+        ImageStatistics,
+        RelevantDocsStatistics,
+        TextStatistics,
+        TopRankedStatistics,
+    )
+
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/retrieval_dataset_loaders.py CHANGED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import logging
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict
 
 from datasets import (
     Dataset,
@@ -11,13 +13,14 @@ from datasets import (
     load_dataset,
 )
 
-from mteb.types import (
-    CorpusDatasetType,
-    InstructionDatasetType,
-    QueryDatasetType,
-    RelevantDocumentsType,
-    TopRankedDocumentsType,
-)
+if TYPE_CHECKING:
+    from mteb.types import (
+        CorpusDatasetType,
+        InstructionDatasetType,
+        QueryDatasetType,
+        RelevantDocumentsType,
+        TopRankedDocumentsType,
+    )
 
 logger = logging.getLogger(__name__)
 
mteb/abstasks/sts.py CHANGED
@@ -1,19 +1,14 @@
+from __future__ import annotations
+
 import logging
-from pathlib import Path
-from typing import Any, TypedDict, cast
+from typing import TYPE_CHECKING, Any, TypedDict, cast
 
-from datasets import Dataset
 from scipy.stats import pearsonr, spearmanr
 
 from mteb._evaluators import AnySTSEvaluator
-from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs, PromptType
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from ._statistics_calculation import (
@@ -23,6 +18,20 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb._evaluators.any_sts_evaluator import STSEvaluatorScores
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, PromptType
+    from mteb.types.statistics import (
+        ImageStatistics,
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
@@ -182,7 +191,7 @@ class AbsTaskSTS(AbsTask):
         self, split: str, hf_subset: str | None = None, compute_overall: bool = False
     ) -> AnySTSDescriptiveStatistics:
         first_column, second_column = self.column_names
-        self.dataset = cast(dict[str, dict[str, Dataset]], self.dataset)
+        self.dataset = cast("dict[str, dict[str, Dataset]]", self.dataset)
 
         if hf_subset:
             sentence1 = self.dataset[hf_subset][split][first_column]
mteb/abstasks/task_metadata.py CHANGED
@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import json
 import logging
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, Literal, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from huggingface_hub import (
-    CardData,
     DatasetCard,
     DatasetCardData,
     constants,
@@ -17,13 +18,11 @@ from pydantic import (
     ConfigDict,
     field_validator,
 )
-from typing_extensions import Required, TypedDict
+from typing_extensions import Required, TypedDict  # noqa: TC002
 
 import mteb
 from mteb.languages import check_language_code
 from mteb.types import (
-    HFSubset,
-    ISOLanguageScript,
     Languages,
     Licenses,
     Modalities,
@@ -31,7 +30,17 @@ from mteb.types import (
     StrDate,
     StrURL,
 )
-from mteb.types.statistics import DescriptiveStatistics
+
+if TYPE_CHECKING:
+    from huggingface_hub import (
+        CardData,
+    )
+
+    from mteb.types import (
+        HFSubset,
+        ISOLanguageScript,
+    )
+    from mteb.types.statistics import DescriptiveStatistics
 
 logger = logging.getLogger(__name__)
 
@@ -368,7 +377,7 @@ class TaskMetadata(BaseModel):
         """Return a dictionary mapping huggingface subsets to languages."""
         if isinstance(self.eval_langs, dict):
            return self.eval_langs
-        return {"default": cast(list[str], self.eval_langs)}
+        return {"default": cast("list[str]", self.eval_langs)}
 
     @property
     def intext_citation(self, include_cite: bool = True) -> str:
@@ -697,7 +706,7 @@ class TaskMetadata(BaseModel):
             for val in self.eval_langs.values():
                 languages.extend(val)
         else:
-            languages = cast(list[str], self.eval_langs)
+            languages = cast("list[str]", self.eval_langs)
         # value "python" is not valid. It must be an ISO 639-1, 639-2 or 639-3 code (two/three letters),
         # or a special value like "code", "multilingual".
         readme_langs = []
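
The lone `# noqa: TC002` above exempts `Required` and `TypedDict` from ruff's typing-only third-party import rule, presumably because `TypedDict` subclasses are built while the module executes, so both names must be importable at runtime even though the rest of the file now defers its typing imports. An illustrative sketch; the `ExampleStats` class is hypothetical, not mteb's:

from typing_extensions import Required, TypedDict  # noqa: TC002


class ExampleStats(TypedDict, total=False):
    # The class body runs at import time: TypedDict is called to build the
    # class, and Required[int] is evaluated to mark the key as mandatory.
    # Moving either import under TYPE_CHECKING would raise NameError here.
    num_samples: Required[int]
    average_length: float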
mteb/abstasks/text/bitext_mining.py CHANGED
@@ -1,7 +1,8 @@
+from __future__ import annotations
+
 import logging
 from collections import defaultdict
-from pathlib import Path
-from typing import Any, ClassVar, TypedDict, cast
+from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast
 
 from datasets import Dataset, DatasetDict
 from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -9,9 +10,15 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 from mteb._evaluators import BitextMiningEvaluator
 from mteb.abstasks._statistics_calculation import calculate_text_statistics
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs, HFSubset, ScoresDict
-from mteb.types.statistics import SplitDescriptiveStatistics, TextStatistics
+from mteb.models import EncoderProtocol
+from mteb.types.statistics import SplitDescriptiveStatistics
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs, HFSubset, ScoresDict
+    from mteb.types.statistics import TextStatistics
 logger = logging.getLogger(__name__)
 
@@ -90,7 +97,7 @@ class AbsTaskBitextMining(AbsTask):
         if subsets_to_run is not None:
             hf_subsets = [s for s in hf_subsets if s in subsets_to_run]
 
-        encoder_model = cast(EncoderProtocol, model)
+        encoder_model = cast("EncoderProtocol", model)
 
         if self.dataset is None:
             raise ValueError("Dataset is not loaded.")
@@ -127,7 +134,7 @@ class AbsTaskBitextMining(AbsTask):
             **kwargs,
         )
 
-        return cast(dict[HFSubset, ScoresDict], scores)
+        return cast("dict[HFSubset, ScoresDict]", scores)
 
     def _get_pairs(self, parallel: bool) -> list[tuple[str, str]]:
         pairs = self._DEFAULT_PAIR
mteb/abstasks/text/summarization.py CHANGED
@@ -1,24 +1,34 @@
+from __future__ import annotations
+
 import logging
-from pathlib import Path
+from typing import TYPE_CHECKING
 
 import numpy as np
-from datasets import Dataset
 
 from mteb._evaluators import SummarizationEvaluator
-from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
 from mteb.abstasks._statistics_calculation import (
     calculate_score_statistics,
     calculate_text_statistics,
 )
 from mteb.abstasks.abstask import AbsTask
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ScoreStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from datasets import Dataset
+
+    from mteb._evaluators.text.summarization_evaluator import SummarizationMetrics
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ScoreStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/zeroshot_classification.py CHANGED
@@ -1,19 +1,16 @@
+from __future__ import annotations
+
 import logging
-from pathlib import Path
-from typing import TypedDict
+from typing import TYPE_CHECKING, TypedDict
 
 import torch
 from datasets import Dataset
 from sklearn import metrics
 
 from mteb._evaluators import ZeroShotClassificationEvaluator
-from mteb.models import EncoderProtocol, MTEBModels
-from mteb.types import EncodeKwargs
+from mteb.models import EncoderProtocol
 from mteb.types.statistics import (
-    ImageStatistics,
-    LabelStatistics,
     SplitDescriptiveStatistics,
-    TextStatistics,
 )
 
 from ._statistics_calculation import (
@@ -23,6 +20,17 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from mteb.models import MTEBModels
+    from mteb.types import EncodeKwargs
+    from mteb.types.statistics import (
+        ImageStatistics,
+        LabelStatistics,
+        TextStatistics,
+    )
+
 logger = logging.getLogger(__name__)
 
 
mteb/benchmarks/_create_table.py CHANGED
@@ -1,13 +1,17 @@
+from __future__ import annotations
+
 import re
 from collections import defaultdict
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
 
 import mteb
 from mteb.get_tasks import get_task, get_tasks
-from mteb.results.benchmark_results import BenchmarkResults
+
+if TYPE_CHECKING:
+    from mteb.results.benchmark_results import BenchmarkResults
 
 
 def _borda_count(scores: pd.Series) -> pd.Series:
@@ -303,6 +307,7 @@
 
 def _create_summary_table_mean_public_private(
     benchmark_results: BenchmarkResults,
+    exclude_private_from_borda: bool = False,
 ) -> pd.DataFrame:
     """Create summary table from BenchmarkResults.
@@ -311,6 +316,7 @@
 
     Args:
         benchmark_results: BenchmarkResults object containing model results
+        exclude_private_from_borda: If True, calculate the Borda rank using only public tasks
 
     Returns:
         DataFrame with model summaries, ready for styling in the leaderboard
@@ -356,7 +362,11 @@
     joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean(public)", public_mean)
     joint_table.insert(1, "mean(private)", private_mean)
-    joint_table["borda_rank"] = _get_borda_rank(per_task)
+    if exclude_private_from_borda:
+        borda_per_task = per_task[public_task_name]
+    else:
+        borda_per_task = per_task
+    joint_table["borda_rank"] = _get_borda_rank(borda_per_task)
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
     joint_table = joint_table.reset_index()
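
For context on the new `exclude_private_from_borda` flag: `_get_borda_rank` orders models by combining their per-task ranks, and the flag restricts which task columns feed that ranking (`per_task[public_task_name]` keeps only the public tasks). A toy pandas illustration of Borda-style ranking, not mteb's exact implementation:

import pandas as pd

per_task = pd.DataFrame(
    {"TaskA": [0.81, 0.79, 0.75], "TaskB": [0.55, 0.61, 0.60]},
    index=["model_1", "model_2", "model_3"],
)

# Rank models within each task (best score gets rank 1), then order models
# by their summed ranks, so no single task can dominate the final order.
ranks = per_task.rank(ascending=False, axis=0)
borda_rank = ranks.sum(axis=1).rank(method="min")
print(borda_rank.sort_values())  # model_2 first: 2nd on TaskA, 1st on TaskB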
 
mteb/benchmarks/benchmarks/rteb_benchmarks.py CHANGED
@@ -123,9 +123,19 @@ class RtebBenchmark(Benchmark):
            _create_summary_table_mean_public_private,
        )

-        joint_table = _create_summary_table_mean_public_private(benchmark_results)
+        joint_table = _create_summary_table_mean_public_private(
+            benchmark_results, exclude_private_from_borda=True
+        )
+        # issue 3902: temporarily remove the private column from the RTEB summary table
+        if "Mean (Private)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Private)"])
        # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+        # but due to 3902, if a Private column existed, Mean (Task) was the mean of Public and Private, so instead we drop Mean (Task) and rename Mean (Public) to Mean (Task)
        joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
+        if "Mean (Task)" in joint_table.columns:
+            joint_table = joint_table.drop(columns=["Mean (Task)"])
+        joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
+
        return joint_table

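
The net effect of the RTEB override is easier to see as a standalone sketch. The column names come from the diff above, but the frame is fabricated for illustration:

import pandas as pd

# Hypothetical summary table as it might come back from
# _create_summary_table_mean_public_private:
joint_table = pd.DataFrame(
    {
        "Mean (Public)": [0.71, 0.69],
        "Mean (Private)": [0.66, 0.70],
        "Retrieval": [0.69, 0.70],
    },
    index=["model_1", "model_2"],
)

# Mirror the override: hide the private scores (issue 3902), discard the
# public/private blend, and let the public mean stand in as the task mean.
joint_table = joint_table.drop(columns=["Mean (Private)"])
joint_table = joint_table.rename(columns={"Retrieval": "Mean (Task)"})
joint_table = joint_table.drop(columns=["Mean (Task)"])
joint_table = joint_table.rename(columns={"Mean (Public)": "Mean (Task)"})
# Only "Mean (Task)" remains, now holding the public means.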