mteb 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +16 -9
- mteb/_evaluators/any_sts_evaluator.py +10 -5
- mteb/_evaluators/clustering_evaluator.py +10 -4
- mteb/_evaluators/evaluator.py +9 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
- mteb/_evaluators/pair_classification_evaluator.py +10 -5
- mteb/_evaluators/retrieval_evaluator.py +19 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +14 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
- mteb/_evaluators/text/summarization_evaluator.py +8 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +17 -9
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +15 -6
- mteb/abstasks/clustering.py +17 -8
- mteb/abstasks/clustering_legacy.py +14 -6
- mteb/abstasks/image/image_text_pair_classification.py +17 -7
- mteb/abstasks/multilabel_classification.py +11 -5
- mteb/abstasks/pair_classification.py +19 -9
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +27 -16
- mteb/abstasks/retrieval_dataset_loaders.py +11 -8
- mteb/abstasks/sts.py +19 -10
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +14 -7
- mteb/abstasks/text/summarization.py +17 -7
- mteb/abstasks/zeroshot_classification.py +15 -7
- mteb/benchmarks/_create_table.py +13 -3
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +20 -14
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/evaluate.py +20 -18
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +7 -3
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +11 -4
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +9 -4
- mteb/models/model_implementations/bedrock_models.py +16 -6
- mteb/models/model_implementations/blip2_models.py +9 -4
- mteb/models/model_implementations/blip_models.py +9 -4
- mteb/models/model_implementations/bm25.py +15 -10
- mteb/models/model_implementations/bmretriever_models.py +6 -2
- mteb/models/model_implementations/cde_models.py +9 -5
- mteb/models/model_implementations/clip_models.py +9 -4
- mteb/models/model_implementations/cohere_models.py +10 -4
- mteb/models/model_implementations/cohere_v.py +9 -4
- mteb/models/model_implementations/colpali_models.py +4 -3
- mteb/models/model_implementations/colqwen_models.py +10 -31
- mteb/models/model_implementations/colsmol_models.py +1 -1
- mteb/models/model_implementations/conan_models.py +10 -4
- mteb/models/model_implementations/dino_models.py +9 -4
- mteb/models/model_implementations/e5_v.py +9 -4
- mteb/models/model_implementations/eagerworks_models.py +10 -4
- mteb/models/model_implementations/evaclip_models.py +9 -4
- mteb/models/model_implementations/gme_v_models.py +5 -3
- mteb/models/model_implementations/google_models.py +10 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
- mteb/models/model_implementations/hinvec_models.py +5 -1
- mteb/models/model_implementations/jasper_models.py +12 -5
- mteb/models/model_implementations/jina_clip.py +9 -4
- mteb/models/model_implementations/jina_models.py +10 -5
- mteb/models/model_implementations/kalm_models.py +18 -12
- mteb/models/model_implementations/linq_models.py +6 -1
- mteb/models/model_implementations/listconranker.py +9 -4
- mteb/models/model_implementations/llm2clip_models.py +9 -4
- mteb/models/model_implementations/llm2vec_models.py +12 -6
- mteb/models/model_implementations/mcinext_models.py +5 -2
- mteb/models/model_implementations/mdbr_models.py +3 -1
- mteb/models/model_implementations/{mxbai_models.py → mixedbread_ai_models.py} +91 -0
- mteb/models/model_implementations/moco_models.py +9 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +10 -4
- mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
- mteb/models/model_implementations/nomic_models.py +10 -4
- mteb/models/model_implementations/nomic_models_vision.py +4 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
- mteb/models/model_implementations/nvidia_models.py +12 -4
- mteb/models/model_implementations/octen_models.py +1 -1
- mteb/models/model_implementations/openai_models.py +9 -4
- mteb/models/model_implementations/openclip_models.py +9 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
- mteb/models/model_implementations/ops_moa_models.py +7 -2
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -6
- mteb/models/model_implementations/pylate_models.py +19 -13
- mteb/models/model_implementations/qwen3_models.py +8 -1
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/repllama_models.py +13 -6
- mteb/models/model_implementations/rerankers_custom.py +10 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
- mteb/models/model_implementations/salesforce_models.py +7 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
- mteb/models/model_implementations/seed_models.py +1 -1
- mteb/models/model_implementations/siglip_models.py +9 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/uae_models.py +9 -4
- mteb/models/model_implementations/vdr_models.py +7 -1
- mteb/models/model_implementations/vista_models.py +9 -4
- mteb/models/model_implementations/vlm2vec_models.py +9 -4
- mteb/models/model_implementations/voyage_models.py +10 -4
- mteb/models/model_implementations/voyage_v.py +10 -6
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +12 -7
- mteb/models/models_protocols.py +19 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +19 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +41 -10
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/clustering/nob/snl_clustering.py +7 -2
- mteb/tasks/clustering/nob/vg_clustering.py +7 -2
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/RECORD +155 -154
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
mteb/models/abs_encoder.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import warnings
|
|
3
5
|
from abc import ABC, abstractmethod
|
|
4
|
-
from
|
|
5
|
-
from typing import Any, Literal, cast, get_args, overload
|
|
6
|
-
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
|
-
from typing_extensions import Unpack
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, cast, get_args, overload
|
|
9
7
|
|
|
10
8
|
import mteb
|
|
11
|
-
from mteb.abstasks.task_metadata import
|
|
9
|
+
from mteb.abstasks.task_metadata import TaskType
|
|
12
10
|
from mteb.similarity_functions import (
|
|
13
11
|
cos_sim,
|
|
14
12
|
dot_score,
|
|
@@ -18,13 +16,25 @@ from mteb.similarity_functions import (
|
|
|
18
16
|
pairwise_max_sim,
|
|
19
17
|
)
|
|
20
18
|
from mteb.types import (
|
|
21
|
-
Array,
|
|
22
|
-
BatchedInput,
|
|
23
|
-
EncodeKwargs,
|
|
24
19
|
PromptType,
|
|
25
20
|
)
|
|
26
21
|
|
|
27
|
-
from .model_meta import
|
|
22
|
+
from .model_meta import ScoringFunction
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from collections.abc import Callable, Sequence
|
|
26
|
+
|
|
27
|
+
from torch.utils.data import DataLoader
|
|
28
|
+
from typing_extensions import Unpack
|
|
29
|
+
|
|
30
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
31
|
+
from mteb.types import (
|
|
32
|
+
Array,
|
|
33
|
+
BatchedInput,
|
|
34
|
+
EncodeKwargs,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from .model_meta import ModelMeta
|
|
28
38
|
|
|
29
39
|
logger = logging.getLogger(__name__)
|
|
30
40
|
|
|
@@ -314,7 +324,7 @@ class AbsEncoder(ABC):
|
|
|
314
324
|
):
|
|
315
325
|
arr = self.model.similarity(embeddings1, embeddings2)
|
|
316
326
|
# We assume that the model returns an Array-like object:
|
|
317
|
-
arr = cast(Array, arr)
|
|
327
|
+
arr = cast("Array", arr)
|
|
318
328
|
return arr
|
|
319
329
|
return cos_sim(embeddings1, embeddings2)
|
|
320
330
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -352,7 +362,7 @@ class AbsEncoder(ABC):
|
|
|
352
362
|
):
|
|
353
363
|
arr = self.model.similarity_pairwise(embeddings1, embeddings2)
|
|
354
364
|
# We assume that the model returns an Array-like object:
|
|
355
|
-
arr = cast(Array, arr)
|
|
365
|
+
arr = cast("Array", arr)
|
|
356
366
|
return arr
|
|
357
367
|
return pairwise_cos_sim(embeddings1, embeddings2)
|
|
358
368
|
if self.mteb_model_meta.similarity_fn_name is ScoringFunction.COSINE:
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from typing import Any, Protocol, runtime_checkable
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
@runtime_checkable
|
|
@@ -1,6 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
|
|
9
|
+
from PIL import Image
|
|
4
10
|
|
|
5
11
|
|
|
6
12
|
def _hash_item(item: Mapping[str, Any]) -> str:
|
|
@@ -10,8 +16,6 @@ def _hash_item(item: Mapping[str, Any]) -> str:
|
|
|
10
16
|
item_hash = hashlib.sha256(item_text.encode()).hexdigest()
|
|
11
17
|
|
|
12
18
|
if "image" in item:
|
|
13
|
-
from PIL import Image
|
|
14
|
-
|
|
15
19
|
image: Image.Image = item["image"]
|
|
16
20
|
item_hash += hashlib.sha256(image.tobytes()).hexdigest()
|
|
17
21
|
|
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
import warnings
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Any
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
6
8
|
|
|
7
9
|
import numpy as np
|
|
8
10
|
|
|
9
11
|
from mteb._requires_package import requires_package
|
|
10
|
-
from mteb.types import BatchedInput
|
|
11
12
|
|
|
12
13
|
from ._hash_utils import _hash_item
|
|
13
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import faiss
|
|
17
|
+
|
|
18
|
+
from mteb.types import BatchedInput
|
|
19
|
+
|
|
14
20
|
logger = logging.getLogger(__name__)
|
|
15
21
|
|
|
16
22
|
|
|
@@ -24,7 +30,6 @@ class FaissCache:
|
|
|
24
30
|
"FAISS-based vector cache",
|
|
25
31
|
install_instruction="pip install mteb[faiss-cpu]",
|
|
26
32
|
)
|
|
27
|
-
import faiss
|
|
28
33
|
|
|
29
34
|
self.directory = Path(directory)
|
|
30
35
|
self.directory.mkdir(parents=True, exist_ok=True)
|
|
@@ -1,21 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
import torch
|
|
7
9
|
from datasets import Dataset
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
|
|
10
11
|
from mteb._create_dataloaders import create_dataloader
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
|
-
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
13
|
-
CacheBackendProtocol,
|
|
14
|
-
)
|
|
15
12
|
from mteb.models.cache_wrappers.cache_backends.numpy_cache import NumpyCache
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from torch.utils.data import DataLoader
|
|
16
|
+
|
|
17
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
18
|
+
from mteb.models.cache_wrappers.cache_backend_protocol import (
|
|
19
|
+
CacheBackendProtocol,
|
|
20
|
+
)
|
|
21
|
+
from mteb.models.model_meta import ModelMeta
|
|
22
|
+
from mteb.models.models_protocols import EncoderProtocol
|
|
23
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
19
24
|
|
|
20
25
|
logger = logging.getLogger(__name__)
|
|
21
26
|
|
mteb/models/get_model_meta.py
CHANGED
|
@@ -1,15 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import difflib
|
|
2
4
|
import logging
|
|
3
|
-
from
|
|
4
|
-
from typing import Any
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
5
6
|
|
|
6
|
-
from mteb.abstasks import AbsTask
|
|
7
7
|
from mteb.models import (
|
|
8
8
|
ModelMeta,
|
|
9
|
-
MTEBModels,
|
|
10
9
|
)
|
|
11
10
|
from mteb.models.model_implementations import MODEL_REGISTRY
|
|
12
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks import AbsTask
|
|
16
|
+
from mteb.models import (
|
|
17
|
+
MTEBModels,
|
|
18
|
+
)
|
|
19
|
+
|
|
13
20
|
logger = logging.getLogger(__name__)
|
|
14
21
|
|
|
15
22
|
|
mteb/models/instruct_wrapper.py
CHANGED
|
@@ -1,16 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from
|
|
3
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import torch
|
|
6
|
-
from torch.utils.data import DataLoader
|
|
7
7
|
|
|
8
8
|
from mteb._requires_package import requires_package
|
|
9
|
-
from mteb.
|
|
10
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
9
|
+
from mteb.types import PromptType
|
|
11
10
|
|
|
12
11
|
from .abs_encoder import AbsEncoder
|
|
13
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
from torch.utils.data import DataLoader
|
|
17
|
+
|
|
18
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
19
|
+
from mteb.types import Array, BatchedInput
|
|
20
|
+
|
|
21
|
+
|
|
14
22
|
logger = logging.getLogger(__name__)
|
|
15
23
|
|
|
16
24
|
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
8
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
9
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
class ALIGNModel(AbsEncoder):
|
|
@@ -1,20 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import json
|
|
2
4
|
import logging
|
|
3
5
|
import re
|
|
4
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
5
7
|
|
|
6
8
|
import numpy as np
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
9
|
from tqdm.auto import tqdm
|
|
9
10
|
|
|
10
11
|
from mteb._requires_package import requires_package
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
12
|
from mteb.models.abs_encoder import AbsEncoder
|
|
13
13
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
14
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
15
14
|
|
|
16
|
-
from .cohere_models import
|
|
17
|
-
|
|
15
|
+
from .cohere_models import (
|
|
16
|
+
model_prompts as cohere_model_prompts,
|
|
17
|
+
)
|
|
18
|
+
from .cohere_models import (
|
|
19
|
+
supported_languages as cohere_supported_languages,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from torch.utils.data import DataLoader
|
|
24
|
+
|
|
25
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
26
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
27
|
+
|
|
18
28
|
|
|
19
29
|
logger = logging.getLogger(__name__)
|
|
20
30
|
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
8
|
from mteb._requires_package import requires_package
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from torch.utils.data import DataLoader
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
12
17
|
|
|
13
18
|
BLIP2_CITATION = """@inproceedings{li2023blip2,
|
|
14
19
|
title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
|
|
@@ -1,14 +1,19 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
6
|
from torch.nn.functional import normalize
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
from tqdm.auto import tqdm
|
|
7
8
|
|
|
8
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
9
9
|
from mteb.models.abs_encoder import AbsEncoder
|
|
10
10
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
11
|
-
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from torch.utils.data import DataLoader
|
|
14
|
+
|
|
15
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
16
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
12
17
|
|
|
13
18
|
BLIP_CITATION = """@misc{https://doi.org/10.48550/arxiv.2201.12086,
|
|
14
19
|
doi = {10.48550/ARXIV.2201.12086},
|
|
@@ -1,18 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
2
5
|
|
|
3
6
|
from mteb._create_dataloaders import _create_text_queries_dataloader
|
|
4
7
|
from mteb._requires_package import requires_package
|
|
5
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
6
8
|
from mteb.models.model_meta import ModelMeta
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
|
+
from mteb.models.models_protocols import SearchProtocol
|
|
13
|
+
from mteb.types import (
|
|
14
|
+
CorpusDatasetType,
|
|
15
|
+
EncodeKwargs,
|
|
16
|
+
InstructionDatasetType,
|
|
17
|
+
QueryDatasetType,
|
|
18
|
+
RetrievalOutputType,
|
|
19
|
+
TopRankedDocumentsType,
|
|
20
|
+
)
|
|
16
21
|
|
|
17
22
|
logger = logging.getLogger(__name__)
|
|
18
23
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
from
|
|
2
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
3
4
|
|
|
4
5
|
import torch
|
|
5
6
|
from sentence_transformers import SentenceTransformer
|
|
@@ -9,6 +10,9 @@ from mteb.models import ModelMeta
|
|
|
9
10
|
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
|
|
10
11
|
from mteb.types import PromptType
|
|
11
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
12
16
|
|
|
13
17
|
def instruction_template(
|
|
14
18
|
instruction: str, prompt_type: PromptType | None = None
|
|
@@ -1,27 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from collections.abc import Sequence
|
|
3
4
|
from typing import TYPE_CHECKING, Any
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
7
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
8
|
|
|
9
9
|
import mteb
|
|
10
10
|
from mteb._create_dataloaders import _corpus_to_dict
|
|
11
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
12
11
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
13
|
-
from mteb.models.models_protocols import PromptType
|
|
14
12
|
from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
|
|
15
|
-
from mteb.types import
|
|
13
|
+
from mteb.types import PromptType
|
|
16
14
|
|
|
17
15
|
from .bge_models import bge_full_data
|
|
18
16
|
|
|
19
17
|
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Sequence
|
|
19
|
+
|
|
20
|
+
from torch.utils.data import DataLoader
|
|
21
|
+
|
|
20
22
|
from mteb.abstasks import (
|
|
21
23
|
AbsTaskClassification,
|
|
22
24
|
AbsTaskRetrieval,
|
|
23
25
|
AbsTaskSummarization,
|
|
24
26
|
)
|
|
27
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
28
|
+
from mteb.types import Array, BatchedInput
|
|
25
29
|
logger = logging.getLogger(__name__)
|
|
26
30
|
|
|
27
31
|
CDE_CITATION = """@misc{morris2024contextualdocumentembeddings,
|
|
@@ -1,13 +1,18 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
2
4
|
|
|
3
5
|
import torch
|
|
4
|
-
from torch.utils.data import DataLoader
|
|
5
6
|
from tqdm.auto import tqdm
|
|
6
7
|
|
|
7
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
8
8
|
from mteb.models.abs_encoder import AbsEncoder
|
|
9
9
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
10
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from torch.utils.data import DataLoader
|
|
13
|
+
|
|
14
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
class CLIPModel(AbsEncoder):
|
|
@@ -1,18 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import time
|
|
3
5
|
from functools import wraps
|
|
4
|
-
from typing import Any, Literal, get_args
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, get_args
|
|
5
7
|
|
|
6
8
|
import numpy as np
|
|
7
9
|
import torch
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
from tqdm.auto import tqdm
|
|
10
11
|
|
|
11
12
|
from mteb._requires_package import requires_package
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
14
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
15
|
-
from mteb.types import
|
|
15
|
+
from mteb.types import PromptType
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput
|
|
16
22
|
|
|
17
23
|
logger = logging.getLogger(__name__)
|
|
18
24
|
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import base64
|
|
2
4
|
import io
|
|
3
5
|
import os
|
|
4
6
|
import time
|
|
5
|
-
from typing import Any, Literal, get_args
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal, get_args
|
|
6
8
|
|
|
7
9
|
import torch
|
|
8
|
-
from torch.utils.data import DataLoader
|
|
9
10
|
from tqdm.auto import tqdm
|
|
10
11
|
|
|
11
12
|
from mteb._requires_package import requires_image_dependencies, requires_package
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models import ModelMeta
|
|
14
14
|
from mteb.models.abs_encoder import AbsEncoder
|
|
15
15
|
from mteb.models.model_implementations.cohere_models import (
|
|
@@ -18,7 +18,12 @@ from mteb.models.model_implementations.cohere_models import (
|
|
|
18
18
|
retry_with_rate_limit,
|
|
19
19
|
)
|
|
20
20
|
from mteb.models.model_meta import ScoringFunction
|
|
21
|
-
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from torch.utils.data import DataLoader
|
|
24
|
+
|
|
25
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
26
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
22
27
|
|
|
23
28
|
|
|
24
29
|
def _post_process_embeddings(
|
|
@@ -4,20 +4,21 @@ import logging
|
|
|
4
4
|
from typing import TYPE_CHECKING, Any
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
|
-
from torch.utils.data import DataLoader
|
|
8
7
|
from tqdm.auto import tqdm
|
|
9
8
|
|
|
10
9
|
from mteb._requires_package import (
|
|
11
10
|
requires_image_dependencies,
|
|
12
11
|
requires_package,
|
|
13
12
|
)
|
|
14
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
16
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
17
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
18
15
|
|
|
19
16
|
if TYPE_CHECKING:
|
|
20
17
|
from PIL import Image
|
|
18
|
+
from torch.utils.data import DataLoader
|
|
19
|
+
|
|
20
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
21
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
21
22
|
|
|
22
23
|
logger = logging.getLogger(__name__)
|
|
23
24
|
|
|
@@ -1,18 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
|
-
from typing import Any
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
3
5
|
|
|
4
6
|
import torch
|
|
5
|
-
from torch.utils.data import DataLoader
|
|
6
7
|
from tqdm.auto import tqdm
|
|
7
8
|
|
|
8
9
|
from mteb._requires_package import (
|
|
9
10
|
requires_image_dependencies,
|
|
10
11
|
requires_package,
|
|
11
12
|
)
|
|
12
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
13
13
|
from mteb.models.abs_encoder import AbsEncoder
|
|
14
14
|
from mteb.models.model_meta import ModelMeta, ScoringFunction
|
|
15
|
-
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from torch.utils.data import DataLoader
|
|
18
|
+
|
|
19
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
20
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
16
21
|
|
|
17
22
|
from .colpali_models import (
|
|
18
23
|
COLPALI_CITATION,
|
|
@@ -329,32 +334,6 @@ colqwen3_4b = ModelMeta(
|
|
|
329
334
|
citation=TOMORO_CITATION,
|
|
330
335
|
)
|
|
331
336
|
|
|
332
|
-
colnomic_7b = ModelMeta(
|
|
333
|
-
loader=ColQwen2_5Wrapper,
|
|
334
|
-
loader_kwargs=dict(
|
|
335
|
-
torch_dtype=torch.float16,
|
|
336
|
-
),
|
|
337
|
-
name="nomic-ai/colnomic-embed-multimodal-7b",
|
|
338
|
-
model_type=["late-interaction"],
|
|
339
|
-
languages=["eng-Latn"],
|
|
340
|
-
revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f",
|
|
341
|
-
release_date="2025-03-31",
|
|
342
|
-
modalities=["image", "text"],
|
|
343
|
-
n_parameters=7_000_000_000,
|
|
344
|
-
memory_usage_mb=14400,
|
|
345
|
-
max_tokens=128000,
|
|
346
|
-
embed_dim=128,
|
|
347
|
-
license="apache-2.0",
|
|
348
|
-
open_weights=True,
|
|
349
|
-
public_training_code="https://github.com/nomic-ai/colpali",
|
|
350
|
-
public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
|
|
351
|
-
framework=["ColPali", "safetensors"],
|
|
352
|
-
reference="https://huggingface.co/nomic-ai/colnomic-embed-multimodal-7b",
|
|
353
|
-
similarity_fn_name="MaxSim",
|
|
354
|
-
use_instructions=True,
|
|
355
|
-
training_datasets=COLPALI_TRAINING_DATA,
|
|
356
|
-
citation=COLPALI_CITATION,
|
|
357
|
-
)
|
|
358
337
|
|
|
359
338
|
COLNOMIC_CITATION = """
|
|
360
339
|
@misc{nomicembedmultimodal2025,
|
|
@@ -402,7 +381,7 @@ colnomic_3b = ModelMeta(
|
|
|
402
381
|
)
|
|
403
382
|
|
|
404
383
|
colnomic_7b = ModelMeta(
|
|
405
|
-
loader=
|
|
384
|
+
loader=ColQwen2_5Wrapper,
|
|
406
385
|
loader_kwargs=dict(
|
|
407
386
|
torch_dtype=torch.float16,
|
|
408
387
|
),
|
|
@@ -56,7 +56,7 @@ colsmol_256m = ModelMeta(
|
|
|
56
56
|
name="vidore/colSmol-256M",
|
|
57
57
|
model_type=["late-interaction"],
|
|
58
58
|
languages=["eng-Latn"],
|
|
59
|
-
revision="
|
|
59
|
+
revision="a59110fdf114638b8018e6c9a018907e12f14855",
|
|
60
60
|
release_date="2025-01-22",
|
|
61
61
|
modalities=["image", "text"],
|
|
62
62
|
n_parameters=256_000_000,
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import hashlib
|
|
2
4
|
import json
|
|
3
5
|
import logging
|
|
@@ -5,20 +7,24 @@ import os
|
|
|
5
7
|
import random
|
|
6
8
|
import string
|
|
7
9
|
import time
|
|
8
|
-
from typing import Any
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
9
11
|
|
|
10
12
|
import numpy as np
|
|
11
13
|
import requests
|
|
12
|
-
from torch.utils.data import DataLoader
|
|
13
14
|
|
|
14
|
-
from mteb.abstasks.task_metadata import TaskMetadata
|
|
15
15
|
from mteb.models.abs_encoder import AbsEncoder
|
|
16
16
|
from mteb.models.model_meta import ModelMeta
|
|
17
|
-
from mteb.types import Array, BatchedInput, PromptType
|
|
18
17
|
|
|
19
18
|
from .bge_models import bge_full_data
|
|
20
19
|
from .e5_instruct import E5_MISTRAL_TRAINING_DATA
|
|
21
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from torch.utils.data import DataLoader
|
|
23
|
+
|
|
24
|
+
from mteb.abstasks.task_metadata import TaskMetadata
|
|
25
|
+
from mteb.types import Array, BatchedInput, PromptType
|
|
26
|
+
|
|
27
|
+
|
|
22
28
|
conan_zh_datasets = {
|
|
23
29
|
"BQ",
|
|
24
30
|
"LCQMC",
|