mteb 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_create_dataloaders.py +16 -9
- mteb/_evaluators/any_sts_evaluator.py +10 -5
- mteb/_evaluators/clustering_evaluator.py +10 -4
- mteb/_evaluators/evaluator.py +9 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
- mteb/_evaluators/pair_classification_evaluator.py +10 -5
- mteb/_evaluators/retrieval_evaluator.py +19 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +14 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
- mteb/_evaluators/text/summarization_evaluator.py +8 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +17 -9
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +15 -6
- mteb/abstasks/clustering.py +17 -8
- mteb/abstasks/clustering_legacy.py +14 -6
- mteb/abstasks/image/image_text_pair_classification.py +17 -7
- mteb/abstasks/multilabel_classification.py +11 -5
- mteb/abstasks/pair_classification.py +19 -9
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +28 -17
- mteb/abstasks/retrieval_dataset_loaders.py +11 -8
- mteb/abstasks/sts.py +19 -10
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +14 -7
- mteb/abstasks/text/summarization.py +17 -7
- mteb/abstasks/zeroshot_classification.py +15 -7
- mteb/benchmarks/_create_table.py +13 -3
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +41 -2
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +10 -5
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/descriptive_stats/Retrieval/BrightAopsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightBiologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEarthScienceRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightEconomicsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightLeetcodeRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPonyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightPsychologyRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightRoboticsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightStackoverflowRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingLongRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightSustainableLivingRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQAQuestionsRetrieval.json +35 -0
- mteb/descriptive_stats/Retrieval/BrightTheoremQATheoremsRetrieval.json +35 -0
- mteb/evaluate.py +20 -18
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +7 -3
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +11 -4
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +10 -4
- mteb/models/model_implementations/amazon_models.py +1 -0
- mteb/models/model_implementations/andersborges.py +2 -0
- mteb/models/model_implementations/ara_models.py +1 -0
- mteb/models/model_implementations/arctic_models.py +8 -0
- mteb/models/model_implementations/b1ade_models.py +1 -0
- mteb/models/model_implementations/bedrock_models.py +20 -6
- mteb/models/model_implementations/bge_models.py +40 -1
- mteb/models/model_implementations/bica_model.py +1 -0
- mteb/models/model_implementations/blip2_models.py +11 -4
- mteb/models/model_implementations/blip_models.py +17 -4
- mteb/models/model_implementations/bm25.py +22 -14
- mteb/models/model_implementations/bmretriever_models.py +10 -2
- mteb/models/model_implementations/cadet_models.py +1 -0
- mteb/models/model_implementations/cde_models.py +11 -5
- mteb/models/model_implementations/clip_models.py +12 -4
- mteb/models/model_implementations/clips_models.py +3 -0
- mteb/models/model_implementations/codefuse_models.py +5 -0
- mteb/models/model_implementations/codesage_models.py +3 -0
- mteb/models/model_implementations/cohere_models.py +14 -4
- mteb/models/model_implementations/cohere_v.py +14 -4
- mteb/models/model_implementations/colpali_models.py +7 -3
- mteb/models/model_implementations/colqwen_models.py +17 -31
- mteb/models/model_implementations/colsmol_models.py +3 -1
- mteb/models/model_implementations/conan_models.py +11 -4
- mteb/models/model_implementations/dino_models.py +28 -4
- mteb/models/model_implementations/e5_instruct.py +4 -0
- mteb/models/model_implementations/e5_models.py +9 -0
- mteb/models/model_implementations/e5_v.py +10 -4
- mteb/models/model_implementations/eagerworks_models.py +11 -4
- mteb/models/model_implementations/emillykkejensen_models.py +3 -0
- mteb/models/model_implementations/en_code_retriever.py +1 -0
- mteb/models/model_implementations/euler_models.py +1 -0
- mteb/models/model_implementations/evaclip_models.py +13 -4
- mteb/models/model_implementations/fa_models.py +9 -0
- mteb/models/model_implementations/facebookai.py +2 -0
- mteb/models/model_implementations/geogpt_models.py +1 -0
- mteb/models/model_implementations/gme_v_models.py +7 -3
- mteb/models/model_implementations/google_models.py +15 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +7 -5
- mteb/models/model_implementations/gritlm_models.py +2 -0
- mteb/models/model_implementations/gte_models.py +9 -0
- mteb/models/model_implementations/hinvec_models.py +6 -1
- mteb/models/model_implementations/human.py +1 -0
- mteb/models/model_implementations/ibm_granite_models.py +6 -0
- mteb/models/model_implementations/inf_models.py +2 -0
- mteb/models/model_implementations/jasper_models.py +14 -5
- mteb/models/model_implementations/jina_clip.py +10 -4
- mteb/models/model_implementations/jina_models.py +17 -5
- mteb/models/model_implementations/kalm_models.py +24 -12
- mteb/models/model_implementations/kblab.py +1 -0
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -0
- mteb/models/model_implementations/kfst.py +1 -0
- mteb/models/model_implementations/kowshik24_models.py +1 -0
- mteb/models/model_implementations/lens_models.py +2 -0
- mteb/models/model_implementations/lgai_embedding_models.py +1 -0
- mteb/models/model_implementations/linq_models.py +7 -1
- mteb/models/model_implementations/listconranker.py +10 -4
- mteb/models/model_implementations/llm2clip_models.py +12 -4
- mteb/models/model_implementations/llm2vec_models.py +20 -6
- mteb/models/model_implementations/mcinext_models.py +8 -2
- mteb/models/model_implementations/mdbr_models.py +2 -0
- mteb/models/model_implementations/misc_models.py +63 -0
- mteb/models/model_implementations/mixedbread_ai_models.py +3 -0
- mteb/models/model_implementations/mme5_models.py +2 -1
- mteb/models/model_implementations/moco_models.py +11 -4
- mteb/models/model_implementations/mod_models.py +2 -1
- mteb/models/model_implementations/model2vec_models.py +23 -4
- mteb/models/model_implementations/moka_models.py +3 -0
- mteb/models/model_implementations/nbailab.py +3 -0
- mteb/models/model_implementations/no_instruct_sentence_models.py +13 -5
- mteb/models/model_implementations/nomic_models.py +16 -4
- mteb/models/model_implementations/nomic_models_vision.py +5 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +9 -3
- mteb/models/model_implementations/nvidia_models.py +15 -4
- mteb/models/model_implementations/octen_models.py +3 -1
- mteb/models/model_implementations/openai_models.py +14 -4
- mteb/models/model_implementations/openclip_models.py +17 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +15 -4
- mteb/models/model_implementations/ops_moa_models.py +9 -2
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -0
- mteb/models/model_implementations/pawan_models.py +1 -0
- mteb/models/model_implementations/piccolo_models.py +2 -0
- mteb/models/model_implementations/promptriever_models.py +16 -6
- mteb/models/model_implementations/pylate_models.py +22 -13
- mteb/models/model_implementations/qodo_models.py +2 -0
- mteb/models/model_implementations/qtack_models.py +1 -0
- mteb/models/model_implementations/qwen3_models.py +11 -1
- mteb/models/model_implementations/qzhou_models.py +2 -0
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/rasgaard_models.py +1 -0
- mteb/models/model_implementations/reasonir_model.py +65 -0
- mteb/models/model_implementations/repllama_models.py +15 -6
- mteb/models/model_implementations/rerankers_custom.py +13 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +24 -4
- mteb/models/model_implementations/richinfoai_models.py +1 -0
- mteb/models/model_implementations/ru_sentence_models.py +20 -0
- mteb/models/model_implementations/ruri_models.py +10 -0
- mteb/models/model_implementations/salesforce_models.py +10 -1
- mteb/models/model_implementations/samilpwc_models.py +1 -0
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -0
- mteb/models/model_implementations/searchmap_models.py +1 -0
- mteb/models/model_implementations/seed_1_6_embedding_models.py +5 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +6 -2
- mteb/models/model_implementations/seed_models.py +2 -1
- mteb/models/model_implementations/sentence_transformers_models.py +18 -0
- mteb/models/model_implementations/shuu_model.py +1 -0
- mteb/models/model_implementations/siglip_models.py +19 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/sonar_models.py +2 -1
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -0
- mteb/models/model_implementations/stella_models.py +6 -0
- mteb/models/model_implementations/tarka_models.py +2 -0
- mteb/models/model_implementations/text2vec_models.py +3 -0
- mteb/models/model_implementations/ua_sentence_models.py +1 -0
- mteb/models/model_implementations/uae_models.py +10 -4
- mteb/models/model_implementations/vdr_models.py +8 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -0
- mteb/models/model_implementations/vista_models.py +11 -4
- mteb/models/model_implementations/vlm2vec_models.py +11 -4
- mteb/models/model_implementations/voyage_models.py +25 -4
- mteb/models/model_implementations/voyage_v.py +11 -6
- mteb/models/model_implementations/xyz_models.py +1 -0
- mteb/models/model_implementations/youtu_models.py +1 -0
- mteb/models/model_implementations/yuan_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +2 -1
- mteb/models/model_meta.py +47 -9
- mteb/models/models_protocols.py +19 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +19 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +15 -9
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/clustering/nob/snl_clustering.py +7 -2
- mteb/tasks/clustering/nob/vg_clustering.py +7 -2
- mteb/tasks/retrieval/eng/__init__.py +42 -0
- mteb/tasks/retrieval/eng/bright_retrieval.py +9 -1
- mteb/tasks/retrieval/eng/bright_v1_1_retrieval.py +968 -0
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/METADATA +1 -1
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/RECORD +238 -217
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/WHEEL +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.2.dist-info → mteb-2.7.4.dist-info}/top_level.txt +0 -0
|
@@ -16,6 +16,7 @@ greennode_embedding_large_vn_v1 = ModelMeta(
|
|
|
16
16
|
loader=sentence_transformers_loader,
|
|
17
17
|
open_weights=True,
|
|
18
18
|
n_parameters=568_000_000,
|
|
19
|
+
n_embedding_parameters=256_002_048,
|
|
19
20
|
memory_usage_mb=2167,
|
|
20
21
|
embed_dim=1024,
|
|
21
22
|
license="cc-by-4.0",
|
|
@@ -41,6 +42,7 @@ greennode_embedding_large_vn_mixed_v1 = ModelMeta(
|
|
|
41
42
|
loader=sentence_transformers_loader,
|
|
42
43
|
open_weights=True,
|
|
43
44
|
n_parameters=568_000_000,
|
|
45
|
+
n_embedding_parameters=256_002_048,
|
|
44
46
|
memory_usage_mb=2167,
|
|
45
47
|
embed_dim=1024,
|
|
46
48
|
license="cc-by-4.0",
|
|
@@ -66,6 +68,7 @@ aiteamvn_vietnamese_embeddings = ModelMeta(
|
|
|
66
68
|
loader=sentence_transformers_loader,
|
|
67
69
|
open_weights=True,
|
|
68
70
|
n_parameters=568_000_000,
|
|
71
|
+
n_embedding_parameters=256_002_048,
|
|
69
72
|
memory_usage_mb=2166,
|
|
70
73
|
embed_dim=1024,
|
|
71
74
|
license="cc-by-4.0",
|
|
@@ -98,6 +101,7 @@ hiieu_halong_embedding = ModelMeta(
|
|
|
98
101
|
use_instructions=False,
|
|
99
102
|
open_weights=True,
|
|
100
103
|
n_parameters=278_000_000,
|
|
104
|
+
n_embedding_parameters=192_001_536,
|
|
101
105
|
memory_usage_mb=1061,
|
|
102
106
|
embed_dim=768,
|
|
103
107
|
license="apache-2.0",
|
|
@@ -129,6 +133,7 @@ sup_simcse_vietnamese_phobert_base_ = ModelMeta(
|
|
|
129
133
|
use_instructions=False,
|
|
130
134
|
open_weights=True,
|
|
131
135
|
n_parameters=135_000_000,
|
|
136
|
+
n_embedding_parameters=49_152_768,
|
|
132
137
|
memory_usage_mb=517,
|
|
133
138
|
max_tokens=256,
|
|
134
139
|
embed_dim=768,
|
|
@@ -167,6 +172,7 @@ bkai_foundation_models_vietnamese_bi_encoder = ModelMeta(
|
|
|
167
172
|
use_instructions=False,
|
|
168
173
|
open_weights=True,
|
|
169
174
|
n_parameters=135_000_000,
|
|
175
|
+
n_embedding_parameters=49_152_768,
|
|
170
176
|
memory_usage_mb=515,
|
|
171
177
|
max_tokens=256,
|
|
172
178
|
embed_dim=768,
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
8
|
from mteb._requires_package import requires_image_dependencies
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from torch.utils.data import DataLoader
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
12
17
|
|
|
13
18
|
VISTA_CITATION = """@article{zhou2024vista,
|
|
14
19
|
title={VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval},
|
|
@@ -253,6 +258,7 @@ visualized_bge_base = ModelMeta(
|
|
|
253
258
|
release_date="2024-06-06",
|
|
254
259
|
modalities=["image", "text"],
|
|
255
260
|
n_parameters=196_000_000,
|
|
261
|
+
n_embedding_parameters=None,
|
|
256
262
|
memory_usage_mb=1631,
|
|
257
263
|
max_tokens=512,
|
|
258
264
|
embed_dim=768,
|
|
@@ -281,6 +287,7 @@ visualized_bge_m3 = ModelMeta(
|
|
|
281
287
|
release_date="2024-06-06",
|
|
282
288
|
modalities=["image", "text"],
|
|
283
289
|
n_parameters=872_909_505,
|
|
290
|
+
n_embedding_parameters=None,
|
|
284
291
|
memory_usage_mb=4263,
|
|
285
292
|
max_tokens=8192,
|
|
286
293
|
embed_dim=1024,
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
from tqdm.auto import tqdm
|
|
7
8
|
|
|
8
9
|
from mteb._requires_package import (
|
|
@@ -10,10 +11,14 @@ from mteb._requires_package import (
|
|
|
10
11
|
requires_package,
|
|
11
12
|
suggest_package,
|
|
12
13
|
)
|
|
13
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
14
14
|
from mteb.models.abs_encoder import AbsEncoder
|
|
15
15
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
16
|
-
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
17
22
|
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
|
@@ -275,6 +280,7 @@ vlm2vec_lora = ModelMeta(
|
|
|
275
280
|
release_date="2024-10-08",
|
|
276
281
|
modalities=["image", "text"],
|
|
277
282
|
n_parameters=None,
|
|
283
|
+
n_embedding_parameters=None,
|
|
278
284
|
memory_usage_mb=None,
|
|
279
285
|
max_tokens=131072,
|
|
280
286
|
embed_dim=3072,
|
|
@@ -299,6 +305,7 @@ vlm2vec_full = ModelMeta(
|
|
|
299
305
|
release_date="2024-10-08",
|
|
300
306
|
modalities=["image", "text"],
|
|
301
307
|
n_parameters=4_150_000_000,
|
|
308
|
+
n_embedding_parameters=None,
|
|
302
309
|
memory_usage_mb=7909,
|
|
303
310
|
max_tokens=131072,
|
|
304
311
|
embed_dim=3072,
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import time
|
|
2
4
|
from functools import wraps
|
|
3
|
-
from typing import Any, Literal
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
8
|
from tqdm.auto import tqdm
|
|
8
9
|
|
|
9
10
|
from mteb._requires_package import requires_package
|
|
10
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
11
11
|
from mteb.models.abs_encoder import AbsEncoder
|
|
12
12
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import PromptType
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
14
20
|
|
|
15
21
|
VOYAGE_TRAINING_DATA = set(
|
|
16
22
|
# Self-reported (message from VoyageAI member)
|
|
@@ -302,6 +308,7 @@ voyage_3_large = ModelMeta(
|
|
|
302
308
|
embed_dim=1024,
|
|
303
309
|
open_weights=False,
|
|
304
310
|
n_parameters=None,
|
|
311
|
+
n_embedding_parameters=None,
|
|
305
312
|
memory_usage_mb=None,
|
|
306
313
|
license=None,
|
|
307
314
|
reference="https://blog.voyageai.com/2025/01/07/voyage-3-large/",
|
|
@@ -330,6 +337,7 @@ voyage_3_5 = ModelMeta(
|
|
|
330
337
|
embed_dim=1024,
|
|
331
338
|
open_weights=False,
|
|
332
339
|
n_parameters=None,
|
|
340
|
+
n_embedding_parameters=None,
|
|
333
341
|
memory_usage_mb=None,
|
|
334
342
|
license=None,
|
|
335
343
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -357,6 +365,7 @@ voyage_3_5_int8 = ModelMeta(
|
|
|
357
365
|
embed_dim=1024,
|
|
358
366
|
open_weights=False,
|
|
359
367
|
n_parameters=None,
|
|
368
|
+
n_embedding_parameters=None,
|
|
360
369
|
memory_usage_mb=None,
|
|
361
370
|
license=None,
|
|
362
371
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -384,6 +393,7 @@ voyage_3_5_binary = ModelMeta(
|
|
|
384
393
|
embed_dim=1024, # Same as original after unpacking from bits
|
|
385
394
|
open_weights=False,
|
|
386
395
|
n_parameters=None,
|
|
396
|
+
n_embedding_parameters=None,
|
|
387
397
|
memory_usage_mb=None,
|
|
388
398
|
license=None,
|
|
389
399
|
reference="https://blog.voyageai.com/2025/05/20/voyage-3-5/",
|
|
@@ -411,6 +421,7 @@ voyage_large_2_instruct = ModelMeta(
|
|
|
411
421
|
embed_dim=1024,
|
|
412
422
|
open_weights=False,
|
|
413
423
|
n_parameters=None,
|
|
424
|
+
n_embedding_parameters=None,
|
|
414
425
|
memory_usage_mb=None,
|
|
415
426
|
license=None,
|
|
416
427
|
reference="https://blog.voyageai.com/2024/05/05/voyage-large-2-instruct-instruction-tuned-and-rank-1-on-mteb/",
|
|
@@ -437,6 +448,7 @@ voyage_finance_2 = ModelMeta(
|
|
|
437
448
|
embed_dim=1024,
|
|
438
449
|
open_weights=False,
|
|
439
450
|
n_parameters=None,
|
|
451
|
+
n_embedding_parameters=None,
|
|
440
452
|
memory_usage_mb=None,
|
|
441
453
|
license=None,
|
|
442
454
|
reference="https://blog.voyageai.com/2024/06/03/domain-specific-embeddings-finance-edition-voyage-finance-2/",
|
|
@@ -463,6 +475,7 @@ voyage_law_2 = ModelMeta(
|
|
|
463
475
|
embed_dim=1024,
|
|
464
476
|
open_weights=False,
|
|
465
477
|
n_parameters=None,
|
|
478
|
+
n_embedding_parameters=None,
|
|
466
479
|
memory_usage_mb=None,
|
|
467
480
|
license=None,
|
|
468
481
|
reference="https://blog.voyageai.com/2024/04/15/domain-specific-embeddings-and-retrieval-legal-edition-voyage-law-2/",
|
|
@@ -489,6 +502,7 @@ voyage_code_2 = ModelMeta(
|
|
|
489
502
|
embed_dim=1536,
|
|
490
503
|
open_weights=False,
|
|
491
504
|
n_parameters=None,
|
|
505
|
+
n_embedding_parameters=None,
|
|
492
506
|
memory_usage_mb=None,
|
|
493
507
|
license=None,
|
|
494
508
|
reference="https://blog.voyageai.com/2024/01/23/voyage-code-2-elevate-your-code-retrieval/",
|
|
@@ -515,6 +529,7 @@ voyage_code_3 = ModelMeta(
|
|
|
515
529
|
embed_dim=1024,
|
|
516
530
|
open_weights=False,
|
|
517
531
|
n_parameters=None,
|
|
532
|
+
n_embedding_parameters=None,
|
|
518
533
|
memory_usage_mb=None,
|
|
519
534
|
license=None,
|
|
520
535
|
reference="https://blog.voyageai.com/2024/12/04/voyage-code-3/",
|
|
@@ -542,6 +557,7 @@ voyage_large_2 = ModelMeta(
|
|
|
542
557
|
embed_dim=1536,
|
|
543
558
|
open_weights=False,
|
|
544
559
|
n_parameters=None,
|
|
560
|
+
n_embedding_parameters=None,
|
|
545
561
|
memory_usage_mb=None,
|
|
546
562
|
license=None,
|
|
547
563
|
reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
|
|
@@ -568,6 +584,7 @@ voyage_2 = ModelMeta(
|
|
|
568
584
|
embed_dim=1024,
|
|
569
585
|
open_weights=False,
|
|
570
586
|
n_parameters=None,
|
|
587
|
+
n_embedding_parameters=None,
|
|
571
588
|
memory_usage_mb=None,
|
|
572
589
|
license=None,
|
|
573
590
|
reference="https://blog.voyageai.com/2023/10/29/voyage-embeddings/",
|
|
@@ -593,6 +610,7 @@ voyage_multilingual_2 = ModelMeta(
|
|
|
593
610
|
embed_dim=1024,
|
|
594
611
|
open_weights=False,
|
|
595
612
|
n_parameters=None,
|
|
613
|
+
n_embedding_parameters=None,
|
|
596
614
|
memory_usage_mb=None,
|
|
597
615
|
license=None,
|
|
598
616
|
reference="https://blog.voyageai.com/2024/06/10/voyage-multilingual-2-multilingual-embedding-model/",
|
|
@@ -619,6 +637,7 @@ voyage_3 = ModelMeta(
|
|
|
619
637
|
embed_dim=1024,
|
|
620
638
|
open_weights=False,
|
|
621
639
|
n_parameters=None,
|
|
640
|
+
n_embedding_parameters=None,
|
|
622
641
|
memory_usage_mb=None,
|
|
623
642
|
license=None,
|
|
624
643
|
reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
|
|
@@ -645,6 +664,7 @@ voyage_3_lite = ModelMeta(
|
|
|
645
664
|
embed_dim=512,
|
|
646
665
|
open_weights=False,
|
|
647
666
|
n_parameters=None,
|
|
667
|
+
n_embedding_parameters=None,
|
|
648
668
|
memory_usage_mb=None,
|
|
649
669
|
license=None,
|
|
650
670
|
reference="https://blog.voyageai.com/2024/09/18/voyage-3/",
|
|
@@ -673,6 +693,7 @@ voyage_3_exp = ModelMeta(
|
|
|
673
693
|
open_weights=False,
|
|
674
694
|
# from their card https://huggingface.co/voyageai/voyage-3-m-exp#model-information
|
|
675
695
|
n_parameters=int(6918 * 1e6),
|
|
696
|
+
n_embedding_parameters=None,
|
|
676
697
|
memory_usage_mb=None,
|
|
677
698
|
license=None,
|
|
678
699
|
reference="https://huggingface.co/voyageai/voyage-3-m-exp",
|
|
@@ -4,17 +4,19 @@ import logging
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Literal
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
7
|
from tqdm.auto import tqdm
|
|
9
8
|
|
|
10
9
|
from mteb._requires_package import requires_image_dependencies, requires_package
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
10
|
from mteb.models.abs_encoder import AbsEncoder
|
|
13
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
14
|
-
from mteb.types import
|
|
12
|
+
from mteb.types import PromptType
|
|
15
13
|
|
|
16
14
|
if TYPE_CHECKING:
|
|
17
15
|
from PIL import Image
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
18
20
|
|
|
19
21
|
logger = logging.getLogger(__name__)
|
|
20
22
|
|
|
@@ -27,6 +29,8 @@ def _downsample_image(
|
|
|
27
29
|
Returns:
|
|
28
30
|
The downsampled image.
|
|
29
31
|
"""
|
|
32
|
+
from PIL.Image import Resampling
|
|
33
|
+
|
|
30
34
|
width, height = image.size
|
|
31
35
|
pixels = width * height
|
|
32
36
|
|
|
@@ -42,15 +46,15 @@ def _downsample_image(
|
|
|
42
46
|
logger.info(
|
|
43
47
|
f"Downsampling image from {width}x{height} to {new_width}x{new_height}"
|
|
44
48
|
)
|
|
45
|
-
return image.resize(new_size,
|
|
49
|
+
return image.resize(new_size, Resampling.LANCZOS)
|
|
46
50
|
if width > height:
|
|
47
51
|
if width > 10000:
|
|
48
52
|
logger.error("Processing extremely wide images.")
|
|
49
|
-
return image.resize((10000, height),
|
|
53
|
+
return image.resize((10000, height), Resampling.LANCZOS)
|
|
50
54
|
else:
|
|
51
55
|
if height > 10000:
|
|
52
56
|
logger.error("Processing extremely high images.")
|
|
53
|
-
return image.resize((width, 10000),
|
|
57
|
+
return image.resize((width, 10000), Resampling.LANCZOS)
|
|
54
58
|
return image
|
|
55
59
|
|
|
56
60
|
|
|
@@ -211,6 +215,7 @@ voyage_v = ModelMeta(
|
|
|
211
215
|
revision="1",
|
|
212
216
|
release_date="2024-11-10",
|
|
213
217
|
n_parameters=None,
|
|
218
|
+
n_embedding_parameters=None,
|
|
214
219
|
memory_usage_mb=None,
|
|
215
220
|
max_tokens=32768,
|
|
216
221
|
embed_dim=1024,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
2
2
|
from mteb.models.model_meta import ModelMeta
|
|
3
|
-
from mteb.
|
|
3
|
+
from mteb.types import PromptType
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def instruction_template(
|
|
@@ -43,6 +43,7 @@ yuan_embedding_2_en = ModelMeta(
|
|
|
43
43
|
revision="b2fd15da3bcae3473c8529593825c15068f09fce",
|
|
44
44
|
release_date="2025-11-27",
|
|
45
45
|
n_parameters=595776512,
|
|
46
|
+
n_embedding_parameters=None,
|
|
46
47
|
memory_usage_mb=2272,
|
|
47
48
|
embed_dim=1024,
|
|
48
49
|
max_tokens=2048,
|
mteb/models/model_meta.py
CHANGED
|
@@ -3,17 +3,16 @@ from __future__ import annotations
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
import warnings
|
|
6
|
-
from collections.abc import Callable
|
|
6
|
+
from collections.abc import Callable
|
|
7
7
|
from dataclasses import field
|
|
8
8
|
from enum import Enum
|
|
9
9
|
from functools import partial
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
12
12
|
|
|
13
|
+
import numpy as np
|
|
13
14
|
from huggingface_hub import (
|
|
14
|
-
GitCommitInfo,
|
|
15
15
|
ModelCard,
|
|
16
|
-
ModelCardData,
|
|
17
16
|
get_safetensors_metadata,
|
|
18
17
|
hf_hub_download,
|
|
19
18
|
list_repo_commits,
|
|
@@ -29,18 +28,27 @@ from huggingface_hub.errors import (
|
|
|
29
28
|
SafetensorsParsingError,
|
|
30
29
|
)
|
|
31
30
|
from pydantic import BaseModel, ConfigDict, field_validator, model_validator
|
|
31
|
+
from sentence_transformers.models import Transformer
|
|
32
|
+
from torch import nn
|
|
32
33
|
from transformers import AutoConfig
|
|
33
|
-
from typing_extensions import Self
|
|
34
34
|
|
|
35
35
|
from mteb._helpful_enum import HelpfulStrEnum
|
|
36
36
|
from mteb.languages import check_language_code
|
|
37
|
-
from mteb.models.models_protocols import
|
|
37
|
+
from mteb.models.models_protocols import MTEBModels
|
|
38
38
|
from mteb.types import ISOLanguageScript, Licenses, Modalities, StrDate, StrURL
|
|
39
39
|
|
|
40
40
|
if TYPE_CHECKING:
|
|
41
|
+
from collections.abc import Sequence
|
|
42
|
+
|
|
43
|
+
from huggingface_hub import (
|
|
44
|
+
GitCommitInfo,
|
|
45
|
+
ModelCardData,
|
|
46
|
+
)
|
|
41
47
|
from sentence_transformers import CrossEncoder, SentenceTransformer
|
|
48
|
+
from typing_extensions import Self
|
|
42
49
|
|
|
43
50
|
from mteb.abstasks import AbsTask
|
|
51
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
44
52
|
|
|
45
53
|
|
|
46
54
|
logger = logging.getLogger(__name__)
|
|
@@ -94,8 +102,9 @@ class ModelMeta(BaseModel):
|
|
|
94
102
|
loader: The function that loads the model. If None it assumes that the model is not implemented.
|
|
95
103
|
loader_kwargs: The keyword arguments to pass to the loader function.
|
|
96
104
|
name: The name of the model, ideally the name on huggingface. It should be in the format "organization/model_name".
|
|
97
|
-
n_parameters: The number of parameters in the model, e.g. 7_000_000 for a 7M parameter model. Can be
|
|
98
|
-
|
|
105
|
+
n_parameters: The total number of parameters in the model, e.g. `7_000_000` for a 7M parameter model. Can be None if the number of parameters is unknown.
|
|
106
|
+
n_embedding_parameters: The number of parameters used for the embedding layer. Can be None if the number of embedding parameters is not known (e.g. for proprietary models).
|
|
107
|
+
n_active_parameters_override: The number of active parameters used by the model. Should be used **only** for Mixture of Experts models.
|
|
99
108
|
memory_usage_mb: The memory usage of the model in MB. Can be None if the memory usage is not known (e.g. for proprietary models). To calculate it use the `calculate_memory_usage_mb` method.
|
|
100
109
|
max_tokens: The maximum number of tokens the model can handle. Can be None if the maximum number of tokens is not known (e.g. for proprietary
|
|
101
110
|
models).
|
|
@@ -134,6 +143,8 @@ class ModelMeta(BaseModel):
|
|
|
134
143
|
release_date: StrDate | None
|
|
135
144
|
languages: list[ISOLanguageScript] | None
|
|
136
145
|
n_parameters: int | None
|
|
146
|
+
n_active_parameters_override: int | None = None
|
|
147
|
+
n_embedding_parameters: int | None = None
|
|
137
148
|
memory_usage_mb: float | None
|
|
138
149
|
max_tokens: float | None
|
|
139
150
|
embed_dim: int | None
|
|
@@ -192,6 +203,16 @@ class ModelMeta(BaseModel):
|
|
|
192
203
|
"""
|
|
193
204
|
return "cross-encoder" in self.model_type
|
|
194
205
|
|
|
206
|
+
@property
|
|
207
|
+
def n_active_parameters(self):
|
|
208
|
+
"""Number of active parameters. Assumed to be `n_parameters - n_embedding_parameters`. Can be overwritten using `n_active_parameters_override` e.g. for MoE models."""
|
|
209
|
+
if self.n_active_parameters_override is not None:
|
|
210
|
+
return self.n_active_parameters_override
|
|
211
|
+
|
|
212
|
+
if self.n_parameters is not None and self.n_embedding_parameters is not None:
|
|
213
|
+
return self.n_parameters - self.n_embedding_parameters
|
|
214
|
+
return None
|
|
215
|
+
|
|
195
216
|
@field_validator("similarity_fn_name", mode="before")
|
|
196
217
|
@classmethod
|
|
197
218
|
def _validate_similarity_fn_name(cls, value: str) -> ScoringFunction | None:
|
|
@@ -384,6 +405,14 @@ class ModelMeta(BaseModel):
|
|
|
384
405
|
else model.model_card_data.base_model
|
|
385
406
|
)
|
|
386
407
|
meta = cls._from_hub(name, revision, compute_metadata)
|
|
408
|
+
try:
|
|
409
|
+
first = model[0]
|
|
410
|
+
|
|
411
|
+
if isinstance(first, Transformer):
|
|
412
|
+
emb = first.auto_model.get_input_embeddings()
|
|
413
|
+
meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
|
|
414
|
+
except Exception as e:
|
|
415
|
+
logger.warning(f"Could not calculate embedding parameters for {name}: {e}")
|
|
387
416
|
meta.revision = model.model_card_data.base_model_revision or meta.revision
|
|
388
417
|
meta.max_tokens = model.max_seq_length
|
|
389
418
|
meta.embed_dim = model.get_sentence_embedding_dimension()
|
|
@@ -455,6 +484,15 @@ class ModelMeta(BaseModel):
|
|
|
455
484
|
from mteb.models import CrossEncoderWrapper
|
|
456
485
|
|
|
457
486
|
meta = cls._from_hub(model.model.name_or_path, revision, compute_metadata)
|
|
487
|
+
try:
|
|
488
|
+
emb = model.model.get_input_embeddings()
|
|
489
|
+
|
|
490
|
+
if isinstance(emb, nn.Embedding):
|
|
491
|
+
meta.n_embedding_parameters = int(np.prod(emb.weight.shape))
|
|
492
|
+
except Exception as e:
|
|
493
|
+
logger.warning(
|
|
494
|
+
f"Could not calculate embedding parameters for {model.model.name_or_path}: {e}"
|
|
495
|
+
)
|
|
458
496
|
meta.revision = model.config._commit_hash or meta.revision
|
|
459
497
|
meta.loader = CrossEncoderWrapper
|
|
460
498
|
meta.embed_dim = None
|
|
@@ -479,7 +517,7 @@ class ModelMeta(BaseModel):
|
|
|
479
517
|
if isinstance(tasks[0], str):
|
|
480
518
|
benchmark_datasets = set(tasks)
|
|
481
519
|
else:
|
|
482
|
-
tasks = cast(Sequence[
|
|
520
|
+
tasks = cast("Sequence[AbsTask]", tasks)
|
|
483
521
|
benchmark_datasets = set()
|
|
484
522
|
for task in tasks:
|
|
485
523
|
benchmark_datasets.add(task.metadata.name)
|
|
@@ -534,7 +572,7 @@ class ModelMeta(BaseModel):
|
|
|
534
572
|
if isinstance(tasks[0], str):
|
|
535
573
|
benchmark_datasets = set(tasks)
|
|
536
574
|
else:
|
|
537
|
-
tasks = cast(Sequence[
|
|
575
|
+
tasks = cast("Sequence[AbsTask]", tasks)
|
|
538
576
|
benchmark_datasets = {task.metadata.name for task in tasks}
|
|
539
577
|
overlap = training_datasets & benchmark_datasets
|
|
540
578
|
perc_overlap = 100 * (len(overlap) / len(benchmark_datasets))
|
mteb/models/models_protocols.py
CHANGED
|
@@ -1,22 +1,23 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from typing_extensions import Unpack
|
|
5
|
-
|
|
6
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
7
|
-
from mteb.types import (
|
|
8
|
-
Array,
|
|
9
|
-
BatchedInput,
|
|
10
|
-
CorpusDatasetType,
|
|
11
|
-
EncodeKwargs,
|
|
12
|
-
PromptType,
|
|
13
|
-
QueryDatasetType,
|
|
14
|
-
RetrievalOutputType,
|
|
15
|
-
TopRankedDocumentsType,
|
|
16
|
-
)
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
17
4
|
|
|
18
5
|
if TYPE_CHECKING:
|
|
6
|
+
from torch.utils.data import DataLoader
|
|
7
|
+
from typing_extensions import Unpack
|
|
8
|
+
|
|
9
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
10
|
from mteb.models.model_meta import ModelMeta
|
|
11
|
+
from mteb.types import (
|
|
12
|
+
Array,
|
|
13
|
+
BatchedInput,
|
|
14
|
+
CorpusDatasetType,
|
|
15
|
+
EncodeKwargs,
|
|
16
|
+
PromptType,
|
|
17
|
+
QueryDatasetType,
|
|
18
|
+
RetrievalOutputType,
|
|
19
|
+
TopRankedDocumentsType,
|
|
20
|
+
)
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
@runtime_checkable
|
|
@@ -72,7 +73,7 @@ class SearchProtocol(Protocol):
|
|
|
72
73
|
...
|
|
73
74
|
|
|
74
75
|
@property
|
|
75
|
-
def mteb_model_meta(self) ->
|
|
76
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
76
77
|
"""Metadata of the model"""
|
|
77
78
|
...
|
|
78
79
|
|
|
@@ -177,7 +178,7 @@ class EncoderProtocol(Protocol):
|
|
|
177
178
|
...
|
|
178
179
|
|
|
179
180
|
@property
|
|
180
|
-
def mteb_model_meta(self) ->
|
|
181
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
181
182
|
"""Metadata of the model"""
|
|
182
183
|
...
|
|
183
184
|
|
|
@@ -236,7 +237,7 @@ class CrossEncoderProtocol(Protocol):
|
|
|
236
237
|
...
|
|
237
238
|
|
|
238
239
|
@property
|
|
239
|
-
def mteb_model_meta(self) ->
|
|
240
|
+
def mteb_model_meta(self) -> ModelMeta:
|
|
240
241
|
"""Metadata of the model"""
|
|
241
242
|
...
|
|
242
243
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
-
from
|
|
2
|
-
from typing import Protocol
|
|
1
|
+
from __future__ import annotations
|
|
3
2
|
|
|
4
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING, Protocol
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
|
|
8
|
+
from mteb.types import Array, TopRankedDocumentsType
|
|
5
9
|
|
|
6
10
|
|
|
7
11
|
class IndexEncoderSearchProtocol(Protocol):
|
|
@@ -1,14 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import warnings
|
|
3
|
-
from
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import torch
|
|
7
9
|
|
|
8
10
|
from mteb._requires_package import requires_package
|
|
9
11
|
from mteb.models.model_meta import ScoringFunction
|
|
10
|
-
|
|
11
|
-
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
import faiss
|
|
17
|
+
|
|
18
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
19
|
+
from mteb.types import Array, TopRankedDocumentsType
|
|
20
|
+
|
|
12
21
|
|
|
13
22
|
logger = logging.getLogger(__name__)
|
|
14
23
|
|
|
@@ -33,7 +42,6 @@ class FaissSearchIndex:
|
|
|
33
42
|
install_instruction="pip install mteb[faiss-cpu]",
|
|
34
43
|
)
|
|
35
44
|
|
|
36
|
-
import faiss
|
|
37
45
|
from faiss import IndexFlatIP, IndexFlatL2
|
|
38
46
|
|
|
39
47
|
# https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
|
mteb/models/search_wrappers.py
CHANGED
|
@@ -1,28 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import heapq
|
|
2
4
|
import logging
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import torch
|
|
6
8
|
from datasets import Dataset
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
9
|
|
|
9
10
|
from mteb._create_dataloaders import (
|
|
10
11
|
create_dataloader,
|
|
11
12
|
)
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.types import (
|
|
14
|
-
Array,
|
|
15
|
-
BatchedInput,
|
|
16
|
-
CorpusDatasetType,
|
|
17
|
-
EncodeKwargs,
|
|
18
14
|
PromptType,
|
|
19
|
-
QueryDatasetType,
|
|
20
|
-
RetrievalOutputType,
|
|
21
|
-
TopRankedDocumentsType,
|
|
22
15
|
)
|
|
23
16
|
|
|
24
|
-
|
|
25
|
-
from .
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import (
|
|
22
|
+
Array,
|
|
23
|
+
BatchedInput,
|
|
24
|
+
CorpusDatasetType,
|
|
25
|
+
EncodeKwargs,
|
|
26
|
+
QueryDatasetType,
|
|
27
|
+
RetrievalOutputType,
|
|
28
|
+
TopRankedDocumentsType,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
from .models_protocols import CrossEncoderProtocol, EncoderProtocol
|
|
32
|
+
from .search_encoder_index.search_backend_protocol import IndexEncoderSearchProtocol
|
|
26
33
|
|
|
27
34
|
logger = logging.getLogger(__name__)
|
|
28
35
|
|