mteb 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +2 -0
- mteb/_create_dataloaders.py +16 -9
- mteb/_evaluators/any_sts_evaluator.py +10 -5
- mteb/_evaluators/clustering_evaluator.py +10 -4
- mteb/_evaluators/evaluator.py +9 -4
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +6 -4
- mteb/_evaluators/pair_classification_evaluator.py +10 -5
- mteb/_evaluators/retrieval_evaluator.py +19 -13
- mteb/_evaluators/retrieval_metrics.py +9 -3
- mteb/_evaluators/sklearn_evaluator.py +14 -10
- mteb/_evaluators/text/bitext_mining_evaluator.py +8 -3
- mteb/_evaluators/text/summarization_evaluator.py +8 -4
- mteb/_evaluators/zeroshot_classification_evaluator.py +10 -3
- mteb/_helpful_enum.py +5 -1
- mteb/abstasks/_data_filter/filters.py +8 -2
- mteb/abstasks/_data_filter/task_pipelines.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -4
- mteb/abstasks/abstask.py +17 -9
- mteb/abstasks/aggregate_task_metadata.py +20 -9
- mteb/abstasks/aggregated_task.py +15 -8
- mteb/abstasks/classification.py +15 -6
- mteb/abstasks/clustering.py +17 -8
- mteb/abstasks/clustering_legacy.py +14 -6
- mteb/abstasks/image/image_text_pair_classification.py +17 -7
- mteb/abstasks/multilabel_classification.py +11 -5
- mteb/abstasks/pair_classification.py +19 -9
- mteb/abstasks/regression.py +14 -6
- mteb/abstasks/retrieval.py +27 -16
- mteb/abstasks/retrieval_dataset_loaders.py +11 -8
- mteb/abstasks/sts.py +19 -10
- mteb/abstasks/task_metadata.py +17 -8
- mteb/abstasks/text/bitext_mining.py +14 -7
- mteb/abstasks/text/summarization.py +17 -7
- mteb/abstasks/zeroshot_classification.py +15 -7
- mteb/benchmarks/_create_table.py +13 -3
- mteb/benchmarks/benchmark.py +11 -1
- mteb/benchmarks/benchmarks/rteb_benchmarks.py +20 -9
- mteb/cache.py +20 -14
- mteb/cli/_display_tasks.py +9 -3
- mteb/cli/build_cli.py +5 -2
- mteb/cli/generate_model_card.py +9 -2
- mteb/deprecated_evaluator.py +16 -12
- mteb/evaluate.py +20 -18
- mteb/filter_tasks.py +12 -7
- mteb/get_tasks.py +9 -4
- mteb/languages/language_scripts.py +8 -3
- mteb/leaderboard/app.py +7 -3
- mteb/leaderboard/table.py +7 -2
- mteb/load_results.py +9 -3
- mteb/models/abs_encoder.py +22 -12
- mteb/models/cache_wrappers/cache_backend_protocol.py +5 -3
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +8 -4
- mteb/models/cache_wrappers/cache_backends/faiss_cache.py +8 -3
- mteb/models/cache_wrappers/cache_wrapper.py +14 -9
- mteb/models/get_model_meta.py +11 -4
- mteb/models/instruct_wrapper.py +13 -5
- mteb/models/model_implementations/align_models.py +9 -4
- mteb/models/model_implementations/bedrock_models.py +16 -6
- mteb/models/model_implementations/blip2_models.py +9 -4
- mteb/models/model_implementations/blip_models.py +9 -4
- mteb/models/model_implementations/bm25.py +15 -10
- mteb/models/model_implementations/bmretriever_models.py +6 -2
- mteb/models/model_implementations/cde_models.py +9 -5
- mteb/models/model_implementations/clip_models.py +9 -4
- mteb/models/model_implementations/cohere_models.py +10 -4
- mteb/models/model_implementations/cohere_v.py +9 -4
- mteb/models/model_implementations/colpali_models.py +4 -3
- mteb/models/model_implementations/colqwen_models.py +10 -31
- mteb/models/model_implementations/colsmol_models.py +1 -1
- mteb/models/model_implementations/conan_models.py +10 -4
- mteb/models/model_implementations/dino_models.py +9 -4
- mteb/models/model_implementations/e5_v.py +9 -4
- mteb/models/model_implementations/eagerworks_models.py +10 -4
- mteb/models/model_implementations/evaclip_models.py +9 -4
- mteb/models/model_implementations/gme_v_models.py +5 -3
- mteb/models/model_implementations/google_models.py +10 -4
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -5
- mteb/models/model_implementations/hinvec_models.py +5 -1
- mteb/models/model_implementations/jasper_models.py +12 -5
- mteb/models/model_implementations/jina_clip.py +9 -4
- mteb/models/model_implementations/jina_models.py +10 -5
- mteb/models/model_implementations/kalm_models.py +18 -12
- mteb/models/model_implementations/linq_models.py +6 -1
- mteb/models/model_implementations/listconranker.py +9 -4
- mteb/models/model_implementations/llm2clip_models.py +9 -4
- mteb/models/model_implementations/llm2vec_models.py +12 -6
- mteb/models/model_implementations/mcinext_models.py +5 -2
- mteb/models/model_implementations/mdbr_models.py +3 -1
- mteb/models/model_implementations/{mxbai_models.py → mixedbread_ai_models.py} +91 -0
- mteb/models/model_implementations/moco_models.py +9 -4
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +10 -4
- mteb/models/model_implementations/no_instruct_sentence_models.py +12 -5
- mteb/models/model_implementations/nomic_models.py +10 -4
- mteb/models/model_implementations/nomic_models_vision.py +4 -3
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +7 -3
- mteb/models/model_implementations/nvidia_models.py +12 -4
- mteb/models/model_implementations/octen_models.py +1 -1
- mteb/models/model_implementations/openai_models.py +9 -4
- mteb/models/model_implementations/openclip_models.py +9 -4
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +10 -4
- mteb/models/model_implementations/ops_moa_models.py +7 -2
- mteb/models/model_implementations/pixie_models.py +56 -0
- mteb/models/model_implementations/promptriever_models.py +12 -6
- mteb/models/model_implementations/pylate_models.py +19 -13
- mteb/models/model_implementations/qwen3_models.py +8 -1
- mteb/models/model_implementations/random_baseline.py +4 -3
- mteb/models/model_implementations/repllama_models.py +13 -6
- mteb/models/model_implementations/rerankers_custom.py +10 -4
- mteb/models/model_implementations/rerankers_monot5_based.py +10 -4
- mteb/models/model_implementations/salesforce_models.py +7 -1
- mteb/models/model_implementations/seed_1_6_embedding_models.py +4 -2
- mteb/models/model_implementations/seed_1_6_embedding_models_1215.py +5 -2
- mteb/models/model_implementations/seed_models.py +1 -1
- mteb/models/model_implementations/siglip_models.py +9 -4
- mteb/models/model_implementations/slm_models.py +7 -4
- mteb/models/model_implementations/uae_models.py +9 -4
- mteb/models/model_implementations/vdr_models.py +7 -1
- mteb/models/model_implementations/vista_models.py +9 -4
- mteb/models/model_implementations/vlm2vec_models.py +9 -4
- mteb/models/model_implementations/voyage_models.py +10 -4
- mteb/models/model_implementations/voyage_v.py +10 -6
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +12 -7
- mteb/models/models_protocols.py +19 -18
- mteb/models/search_encoder_index/search_backend_protocol.py +7 -3
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +12 -4
- mteb/models/search_wrappers.py +19 -12
- mteb/models/sentence_transformer_wrapper.py +4 -3
- mteb/models/vllm_wrapper.py +8 -6
- mteb/results/benchmark_results.py +22 -17
- mteb/results/model_result.py +21 -15
- mteb/results/task_result.py +41 -10
- mteb/similarity_functions.py +8 -2
- mteb/tasks/aggregated_tasks/eng/cqadupstack_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts17_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/eng/sts_benchmark_multilingual_visual_sts_eng.py +3 -3
- mteb/tasks/aggregated_tasks/fas/cqadupstack_retrieval_fa.py +3 -3
- mteb/tasks/aggregated_tasks/fas/syn_per_chatbot_conv_sa_classification.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts17_multilingual_vision_sts.py +3 -3
- mteb/tasks/aggregated_tasks/multilingual/sts_benchmark_multilingual_visual_sts.py +3 -3
- mteb/tasks/aggregated_tasks/nld/cqadupstack_nl_retrieval.py +3 -3
- mteb/tasks/aggregated_tasks/pol/cqadupstack_retrieval_pl.py +3 -3
- mteb/tasks/clustering/nob/snl_clustering.py +7 -2
- mteb/tasks/clustering/nob/vg_clustering.py +7 -2
- mteb/tasks/retrieval/eng/limit_retrieval.py +6 -1
- mteb/tasks/retrieval/multilingual/ru_sci_bench_retrieval.py +3 -3
- mteb/types/_encoder_io.py +1 -1
- mteb/types/statistics.py +9 -2
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/METADATA +1 -1
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/RECORD +155 -154
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/WHEEL +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/entry_points.txt +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.7.1.dist-info → mteb-2.7.3.dist-info}/top_level.txt +0 -0
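Most of the per-file hunks below apply one refactor: "from __future__ import annotations" is added, and imports needed only for type annotations (DataLoader, TaskMetadata, Array, BatchedInput, PromptType and similar) are moved under an "if TYPE_CHECKING:" guard so they are skipped at runtime. A minimal sketch of the pattern, using a placeholder ExampleEncoder class rather than any actual mteb module:

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # typing.TYPE_CHECKING is False at runtime, so these imports are only
    # executed by static type checkers and add no runtime import cost.
    from torch.utils.data import DataLoader

    from mteb.abstasks.task_metadata import TaskMetadata
    from mteb.types import Array, BatchedInput, PromptType


class ExampleEncoder:
    # With postponed evaluation of annotations (PEP 563), the annotations
    # below are stored as strings and are never looked up at import time.
    def encode(
        self,
        inputs: DataLoader[BatchedInput],
        *,
        task_metadata: TaskMetadata,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> Array:
        raise NotImplementedError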
@@ -1,15 +1,22 @@
-from
+from __future__ import annotations
+
 from itertools import islice
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 
 # https://docs.python.org/3/library/itertools.html#itertools.batched
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
 from packaging.version import Version
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
-from mteb.types import
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -4,17 +4,18 @@ from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import Array, BatchedInput, PromptType
 
 if TYPE_CHECKING:
     from PIL import Image
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 NOMIC_EMBED_VISION_CITATION = """@article{nussbaum2024nomicembedvision,
     title={Nomic Embed Vision: Expanding the Latent Space},
@@ -1,14 +1,18 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
 from packaging.version import Version
 from torch.utils.data import DataLoader
 from transformers import __version__ as transformers_version
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
-
+
+if TYPE_CHECKING:
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 LLAMA_NEMORETRIEVER_CITATION = """@misc{xu2025llamanemoretrievercolembedtopperforming,
     title={Llama Nemoretriever Colembed: Top-Performing Text-Image Retrieval Model},
@@ -1,11 +1,11 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
 from packaging.version import Version
-from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import AutoModel, AutoTokenizer
 from transformers import __version__ as transformers_version
@@ -16,7 +16,15 @@ from mteb.models import CrossEncoderWrapper
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.utils.data import DataLoader
+
+    from mteb import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import logging
-from typing import Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import numpy as np
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -1,14 +1,19 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_image_dependencies, requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 OPENCLIP_CITATION = """@inproceedings{cherti2023reproducible,
     title={Reproducible scaling laws for contrastive language-image learning},
@@ -1,12 +1,18 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
-from mteb.types import
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 v2_training_data = {
     "MSMARCO",
@@ -1,8 +1,13 @@
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
 
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
 
+if TYPE_CHECKING:
+    from mteb.types import Array
+
 
 class OPSWrapper(AbsEncoder):
     def __init__(self, model_name: str, revision: str):
@@ -15,7 +20,7 @@ class OPSWrapper:
         )
         self.output_dim = 1536
 
-    def encode(self, sentences: list[str], **kwargs) ->
+    def encode(self, sentences: list[str], **kwargs) -> Array:
         embeddings = self.model.encode(sentences, **kwargs)
         return embeddings[:, : self.output_dim]
 
@@ -0,0 +1,56 @@
+from mteb.models.model_implementations.arctic_models import (
+    ARCTIC_V2_CITATION,
+    LANGUAGES_V2_0,
+    arctic_v2_training_datasets,
+)
+from mteb.models.model_meta import (
+    ModelMeta,
+    ScoringFunction,
+)
+from mteb.models.sentence_transformer_wrapper import sentence_transformers_loader
+
+PIXIE_RUNE_V1_CITATION = """@misc{TelePIX-PIXIE-Rune-v1.0,
+  title = {PIXIE-Rune-v1.0},
+  author = {TelePIX AI Research Team and Bongmin Kim},
+  year = {2026},
+  howpublished = {Hugging Face model card},
+  url = {https://huggingface.co/telepix/PIXIE-Rune-v1.0}
+}"""
+
+PIXIE_RUNE_V1_PROMPTS = {
+    "query": "query: ",
+    "document": "",
+}
+
+# it is further fine-tuned on TelePIX proprietary IR data (not public).
+pixie_rune_v1_training_datasets = set(arctic_v2_training_datasets) | {
+    "TelePIX-Proprietary-IR-Triplets",
+}
+
+pixie_rune_v1_0 = ModelMeta(
+    loader=sentence_transformers_loader,
+    loader_kwargs={
+        "model_prompts": PIXIE_RUNE_V1_PROMPTS,
+    },
+    name="telepix/PIXIE-Rune-v1.0",
+    model_type=["dense"],
+    revision="b2486496da71191626666a88f9bfec844933a134",
+    release_date="2026-01-15",
+    languages=LANGUAGES_V2_0,
+    open_weights=True,
+    framework=["Sentence Transformers", "PyTorch", "safetensors"],
+    n_parameters=567754752,
+    memory_usage_mb=2166,
+    max_tokens=6144,
+    embed_dim=1024,
+    license="apache-2.0",
+    reference="https://huggingface.co/telepix/PIXIE-Rune-v1.0",
+    similarity_fn_name=ScoringFunction.COSINE,
+    use_instructions=True,
+    adapted_from="Snowflake/snowflake-arctic-embed-l-v2.0",
+    superseded_by=None,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=pixie_rune_v1_training_datasets,
+    citation=PIXIE_RUNE_V1_CITATION + "\n\n" + ARCTIC_V2_CITATION,
+)
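The hunk above is the new pixie_models.py, which registers telepix/PIXIE-Rune-v1.0 through the same ModelMeta and sentence_transformers_loader machinery as the Arctic models it is adapted from. For orientation, a sketch of how a registered model is typically looked up and evaluated through mteb's public helpers; the task choice is illustrative only, and the exact evaluate() call may differ between mteb versions:

import mteb

# Reading the registry entry; the values match the ModelMeta fields above.
meta = mteb.get_model_meta("telepix/PIXIE-Rune-v1.0")
print(meta.embed_dim, meta.max_tokens)  # 1024, 6144

# Loading weights and running a task requires the checkpoint to be
# downloadable (e.g. from the Hugging Face Hub).
model = mteb.get_model("telepix/PIXIE-Rune-v1.0")
tasks = mteb.get_tasks(tasks=["NFCorpus"])
results = mteb.evaluate(model, tasks=tasks)  # assumed signature; see mteb/evaluate.py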
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
-
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.models.models_protocols import EncoderProtocol
+    from mteb.types import Array, BatchedInput, PromptType
 
 from .repllama_models import RepLLaMAModel, model_prompts
 
@@ -1,30 +1,36 @@
+from __future__ import annotations
+
 import heapq
 import logging
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
 from mteb._create_dataloaders import (
     create_dataloader,
 )
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import
-
-
-
-
-
-
-
-
-
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import (
+        Array,
+        BatchedInput,
+        CorpusDatasetType,
+        EncodeKwargs,
+        QueryDatasetType,
+        RetrievalOutputType,
+        TopRankedDocumentsType,
+    )
+
 
 logger = logging.getLogger(__name__)
 
@@ -1,6 +1,13 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta
-from mteb.
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from mteb.models.models_protocols import EncoderProtocol
 
 
 def instruction_template(
@@ -5,18 +5,19 @@ from typing import TYPE_CHECKING, Any, Literal
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta
 from mteb.similarity_functions import (
     select_pairwise_similarity,
     select_similarity,
 )
-from mteb.types._encoder_io import Array, BatchedInput, PromptType
 
 if TYPE_CHECKING:
     from PIL import Image
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types._encoder_io import Array, BatchedInput, PromptType
 
 
 def _string_to_vector(text: str | None, size: int) -> np.ndarray:
@@ -1,22 +1,29 @@
+from __future__ import annotations
+
 import logging
-from
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
 import torch.nn.functional as F
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import (
     ModelMeta,
     ScoringFunction,
 )
-from mteb.
-
+from mteb.types import PromptType
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.models.models_protocols import EncoderProtocol
+    from mteb.types import Array, BatchedInput
 
 logger = logging.getLogger(__name__)
 
@@ -1,16 +1,22 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta
-from mteb.types import Array, BatchedInput, PromptType
 
 from .bge_models import bge_m3_training_data
 
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta
-from mteb.types import Array, BatchedInput, PromptType
 
 from .rerankers_custom import RerankerWrapper
 
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -1,12 +1,18 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from mteb.models.instruct_wrapper import (
     InstructSentenceTransformerModel,
     instruct_wrapper,
 )
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-from mteb.types import PromptType
 
 from .e5_instruct import E5_MISTRAL_TRAINING_DATA
 
+if TYPE_CHECKING:
+    from mteb.types import PromptType
+
 
 def instruction_template(
     instruction: str, prompt_type: PromptType | None = None
@@ -13,16 +13,18 @@ import torch
 from torch.utils.data import DataLoader
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_implementations.bge_models import bge_chinese_training_data
 from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
 from mteb.models.model_meta import ModelMeta
-from mteb.types import
+from mteb.types import PromptType
 
 if TYPE_CHECKING:
     from PIL import Image
 
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
+
 
 
 logger = logging.getLogger(__name__)
@@ -15,15 +15,18 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 
 from mteb._requires_package import requires_package
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_implementations.bge_models import bge_chinese_training_data
 from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
 from mteb.models.model_meta import ModelMeta
-from mteb.types import
+from mteb.types import PromptType
 
 if TYPE_CHECKING:
     from PIL import Image
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput
 
 
 logger = logging.getLogger(__name__)
@@ -9,7 +9,7 @@ from tqdm.auto import tqdm
 from mteb._requires_package import requires_package
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta
-from mteb.
+from mteb.types import PromptType
 
 from .bge_models import bge_chinese_training_data
 from .nvidia_models import nvidia_training_datasets
@@ -1,13 +1,18 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 SIGLIP_CITATION = """@misc{zhai2023sigmoid,
     title={Sigmoid Loss for Language Image Pre-Training},
@@ -13,24 +13,27 @@ Based on:
 from __future__ import annotations
 
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
 from mteb._requires_package import (
     requires_image_dependencies,
     requires_package,
 )
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
 from mteb.models.model_implementations.colpali_models import (
     COLPALI_CITATION,
     COLPALI_TRAINING_DATA,
 )
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -1,13 +1,18 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
-from torch.utils.data import DataLoader
 
-from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.models.sentence_transformer_wrapper import SentenceTransformerEncoderWrapper
-
+
+if TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+
+    from mteb.abstasks.task_metadata import TaskMetadata
+    from mteb.types import Array, BatchedInput, PromptType
 
 logger = logging.getLogger(__name__)
 
@@ -1,6 +1,12 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
 from mteb.models.model_meta import ModelMeta, ScoringFunction
-
+
+if TYPE_CHECKING:
+    from mteb.types import PromptType
 
 
 def instruction_template(