mteb 2.3.10__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/_evaluators/image/imagetext_pairclassification_evaluator.py +7 -2
- mteb/abstasks/_statistics_calculation.py +6 -2
- mteb/abstasks/classification.py +0 -2
- mteb/benchmarks/benchmarks/__init__.py +2 -0
- mteb/benchmarks/benchmarks/benchmarks.py +57 -0
- mteb/deprecated_evaluator.py +8 -13
- mteb/descriptive_stats/Reranking/JQaRARerankingLite.json +35 -0
- mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json +35 -0
- mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json +30 -0
- mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json +30 -0
- mteb/evaluate.py +2 -33
- mteb/leaderboard/figures.py +1 -1
- mteb/leaderboard/table.py +1 -11
- mteb/models/abs_encoder.py +21 -17
- mteb/models/cache_wrappers/cache_backends/_hash_utils.py +2 -2
- mteb/models/get_model_meta.py +3 -123
- mteb/models/instruct_wrapper.py +2 -1
- mteb/models/model_implementations/bica_model.py +34 -0
- mteb/models/model_implementations/colpali_models.py +7 -2
- mteb/models/model_implementations/colqwen_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +9 -5
- mteb/models/model_implementations/google_models.py +10 -0
- mteb/models/model_implementations/granite_vision_embedding_models.py +6 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_models.py +1 -1
- mteb/models/model_implementations/mod_models.py +204 -0
- mteb/models/model_implementations/nomic_models.py +142 -4
- mteb/models/model_implementations/nomic_models_vision.py +6 -2
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +6 -2
- mteb/models/model_implementations/pylate_models.py +1 -4
- mteb/models/model_implementations/random_baseline.py +6 -2
- mteb/models/model_implementations/seed_1_6_embedding_models.py +7 -2
- mteb/models/model_implementations/voyage_v.py +6 -2
- mteb/models/model_meta.py +396 -19
- mteb/models/sentence_transformer_wrapper.py +2 -7
- mteb/tasks/reranking/jpn/__init__.py +9 -1
- mteb/tasks/reranking/jpn/j_qa_ra_reranking_lite.py +49 -0
- mteb/tasks/reranking/jpn/ja_cwir_reranking_lite.py +47 -0
- mteb/tasks/retrieval/code/fresh_stack_retrieval.py +8 -5
- mteb/tasks/retrieval/jpn/__init__.py +8 -0
- mteb/tasks/retrieval/jpn/ja_cwir_retrieval_lite.py +47 -0
- mteb/tasks/retrieval/jpn/jaqket_retrieval_lite.py +50 -0
- mteb/tasks/retrieval/jpn/miracl_ja_retrieval_lite.py +52 -0
- mteb/tasks/retrieval/jpn/mr_tydi_ja_retrieval_lite.py +48 -0
- mteb/types/_encoder_io.py +7 -2
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/METADATA +2 -1
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/RECORD +53 -39
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/WHEEL +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.3.10.dist-info → mteb-2.4.1.dist-info}/top_level.txt +0 -0
mteb/_evaluators/image/imagetext_pairclassification_evaluator.py
CHANGED
@@ -1,10 +1,11 @@
+from __future__ import annotations
+
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn.functional as F
 from datasets import Dataset
-from PIL.Image import Image
 from torch.utils.data import DataLoader
 
 from mteb._create_dataloaders import (
@@ -15,6 +16,10 @@ from mteb._requires_package import requires_image_dependencies
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.models_protocols import EncoderProtocol
 
+if TYPE_CHECKING:
+    from PIL.Image import Image
+
+
 logger = logging.getLogger(__name__)
 
 
mteb/abstasks/_statistics_calculation.py
CHANGED
@@ -1,7 +1,8 @@
+from __future__ import annotations
+
 import hashlib
 from collections import Counter
-
-from PIL import Image
+from typing import TYPE_CHECKING
 
 from mteb.types import TopRankedDocumentsType
 from mteb.types.statistics import (
@@ -13,6 +14,9 @@ from mteb.types.statistics import (
     TopRankedStatistics,
 )
 
+if TYPE_CHECKING:
+    from PIL import Image
+
 
 def calculate_text_statistics(texts: list[str]) -> TextStatistics:
     """Calculate descriptive statistics for a list of texts.
mteb/abstasks/classification.py
CHANGED
@@ -5,7 +5,6 @@ from typing import Any, TypedDict
 
 import numpy as np
 from datasets import Dataset, DatasetDict
-from PIL import ImageFile
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import (
     accuracy_score,
@@ -32,7 +31,6 @@ from ._statistics_calculation import (
 )
 from .abstask import AbsTask
 
-ImageFile.LOAD_TRUNCATED_IMAGES = True
 logger = logging.getLogger(__name__)
 
 
mteb/benchmarks/benchmarks/__init__.py
CHANGED
@@ -12,6 +12,7 @@ from mteb.benchmarks.benchmarks.benchmarks import (
     FA_MTEB_2,
     HUME,
     JINA_VDR,
+    JMTEB_LITE_V1,
     JMTEB_V2,
     LONG_EMBED,
     MIEB_ENG,
@@ -76,6 +77,7 @@ __all__ = [
     "HUME",
     "HUME",
     "JINA_VDR",
+    "JMTEB_LITE_V1",
     "JMTEB_V2",
     "LONG_EMBED",
     "MIEB_ENG",
mteb/benchmarks/benchmarks/benchmarks.py
CHANGED
@@ -2650,3 +2650,60 @@ JMTEB_V2 = Benchmark(
     """,
     contacts=["lsz05"],
 )
+
+JMTEB_LITE_V1 = Benchmark(
+    name="JMTEB-lite(v1)",
+    display_name="Japanese",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
+    tasks=get_tasks(
+        languages=["jpn"],
+        tasks=[
+            # Clustering (3)
+            "LivedoorNewsClustering.v2",
+            "MewsC16JaClustering",
+            "SIB200ClusteringS2S",
+            # Classification (7)
+            "AmazonReviewsClassification",
+            "AmazonCounterfactualClassification",
+            "MassiveIntentClassification",
+            "MassiveScenarioClassification",
+            "JapaneseSentimentClassification",
+            "SIB200Classification",
+            "WRIMEClassification",
+            # STS (2)
+            "JSTS",
+            "JSICK",
+            # Retrieval (11)
+            "JaqketRetrievalLite",
+            "MrTyDiJaRetrievalLite",
+            "JaGovFaqsRetrieval",
+            "NLPJournalTitleAbsRetrieval.V2",
+            "NLPJournalTitleIntroRetrieval.V2",
+            "NLPJournalAbsIntroRetrieval.V2",
+            "NLPJournalAbsArticleRetrieval.V2",
+            "JaCWIRRetrievalLite",
+            "MIRACLJaRetrievalLite",
+            "MintakaRetrieval",
+            "MultiLongDocRetrieval",
+            # Reranking (5)
+            "ESCIReranking",
+            "JQaRARerankingLite",
+            "JaCWIRRerankingLite",
+            "MIRACLReranking",
+            "MultiLongDocReranking",
+        ],
+    ),
+    description="JMTEB-lite is a lightweight version of JMTEB. It makes agile evaluation possible by reaching an average of 5x faster evaluation comparing with JMTEB, as 6 heavy datasets in JMTEB are optimized with hard negative pooling strategy, making them much smaller. The result of JMTEB-lite is proved to be highly relevant with that of JMTEB, making it a faithful preview of JMTEB.",
+    reference="https://huggingface.co/datasets/sbintuitions/JMTEB-lite",
+    citation=r"""
+@article{li2025jmteb,
+  author = {Li, Shengzhe and Ohagi, Masaya and Ri, Ryokan and Fukuchi, Akihiko and Shibata, Tomohide and Kawahara, Daisuke},
+  issue = {3},
+  journal = {Vol.2025-NL-265,No.3,1-15},
+  month = {sep},
+  title = {{JMTEB and JMTEB-lite: Japanese Massive Text Embedding Benchmark and Its Lightweight Version}},
+  year = {2025},
+}
+""",
+    contacts=["lsz05"],
+)
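The new benchmark is registered as "JMTEB-lite(v1)". A minimal sketch of loading it through mteb's benchmark registry, assuming `mteb.get_benchmark` resolves the name the same way it does for the existing benchmarks; nothing below is taken from this diff except the benchmark name and the task count:

    import mteb

    # Look up the benchmark added above and list its 28 tasks
    # (3 clustering + 7 classification + 2 STS + 11 retrieval + 5 reranking).
    benchmark = mteb.get_benchmark("JMTEB-lite(v1)")
    for task in benchmark.tasks:
        print(task.metadata.name)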
mteb/deprecated_evaluator.py
CHANGED
@@ -13,21 +13,11 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any
 
-from mteb.abstasks.task_metadata import TaskCategory, TaskType
-from mteb.models.get_model_meta import (
-    _model_meta_from_cross_encoder,
-    _model_meta_from_sentence_transformers,
-)
-
-if sys.version_info >= (3, 13):
-    from warnings import deprecated
-else:
-    from typing_extensions import deprecated
-
 import datasets
 
 import mteb
 from mteb.abstasks import AbsTask
+from mteb.abstasks.task_metadata import TaskCategory, TaskType
 from mteb.benchmarks import Benchmark
 from mteb.models import (
     CrossEncoderWrapper,
@@ -39,6 +29,11 @@ from mteb.models import (
 from mteb.results import TaskResult
 from mteb.types import ScoresDict
 
+if sys.version_info >= (3, 13):
+    from warnings import deprecated
+else:
+    from typing_extensions import deprecated
+
 if TYPE_CHECKING:
     from sentence_transformers import CrossEncoder, SentenceTransformer
 
@@ -669,9 +664,9 @@ class MTEB:
         from sentence_transformers import CrossEncoder, SentenceTransformer
 
         if isinstance(model, CrossEncoder):
-            meta = _model_meta_from_cross_encoder(model)
+            meta = ModelMeta.from_cross_encoder(model)
         elif isinstance(model, SentenceTransformer):
-            meta = _model_meta_from_sentence_transformers(model)
+            meta = ModelMeta.from_sentence_transformer_model(model)
         else:
             meta = ModelMeta(
                 loader=None,
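The private helpers `_model_meta_from_cross_encoder` and `_model_meta_from_sentence_transformers` are replaced by classmethods on `ModelMeta`. A hedged sketch of deriving metadata for an ad-hoc SentenceTransformer this way; the model id is only an example, and the import path is assumed from the file layout (`mteb/models/model_meta.py`):

    from sentence_transformers import SentenceTransformer

    from mteb.models.model_meta import ModelMeta

    # Wrap a plain SentenceTransformer and derive its metadata via the new classmethod.
    st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    meta = ModelMeta.from_sentence_transformer_model(st_model)
    print(meta.name, meta.revision)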
mteb/descriptive_stats/Reranking/JQaRARerankingLite.json
ADDED
@@ -0,0 +1,35 @@
+{
+  "test": {
+    "num_samples": 91353,
+    "number_of_characters": 21318247,
+    "documents_text_statistics": {
+      "total_text_length": 21231812,
+      "min_text_length": 14,
+      "average_text_length": 236.73496420846064,
+      "max_text_length": 438,
+      "unique_texts": 89683
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 86435,
+      "min_text_length": 16,
+      "average_text_length": 51.850629874025195,
+      "max_text_length": 118,
+      "unique_texts": 1667
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 16204,
+      "min_relevant_docs_per_query": 51,
+      "average_relevant_docs_per_query": 9.720455908818236,
+      "max_relevant_docs_per_query": 78,
+      "unique_relevant_docs": 89686
+    },
+    "top_ranked_statistics": {
+      "num_top_ranked": 98941,
+      "min_top_ranked_per_query": 51,
+      "average_top_ranked_per_query": 59.35272945410918,
+      "max_top_ranked_per_query": 78
+    }
+  }
+}
mteb/descriptive_stats/Reranking/JaCWIRRerankingLite.json
ADDED
@@ -0,0 +1,35 @@
+{
+  "test": {
+    "num_samples": 161744,
+    "number_of_characters": 29754484,
+    "documents_text_statistics": {
+      "total_text_length": 29612965,
+      "min_text_length": 142,
+      "average_text_length": 187.72799599350847,
+      "max_text_length": 252,
+      "unique_texts": 156741
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 141519,
+      "min_text_length": 9,
+      "average_text_length": 35.37975,
+      "max_text_length": 176,
+      "unique_texts": 3993
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 3998,
+      "min_relevant_docs_per_query": 50,
+      "average_relevant_docs_per_query": 0.9995,
+      "max_relevant_docs_per_query": 51,
+      "unique_relevant_docs": 157744
+    },
+    "top_ranked_statistics": {
+      "num_top_ranked": 204000,
+      "min_top_ranked_per_query": 51,
+      "average_top_ranked_per_query": 51.0,
+      "max_top_ranked_per_query": 51
+    }
+  }
+}
mteb/descriptive_stats/Retrieval/JaCWIRRetrievalLite.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 306638,
+    "number_of_characters": 56607519,
+    "documents_text_statistics": {
+      "total_text_length": 56466000,
+      "min_text_length": 142,
+      "average_text_length": 186.57934562084074,
+      "max_text_length": 252,
+      "unique_texts": 299096
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 141519,
+      "min_text_length": 9,
+      "average_text_length": 35.37975,
+      "max_text_length": 176,
+      "unique_texts": 3993
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 4000,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 4000
+    },
+    "top_ranked_statistics": null
+  }
+}
mteb/descriptive_stats/Retrieval/JaqketRetrievalLite.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 66799,
+    "number_of_characters": 280024895,
+    "documents_text_statistics": {
+      "total_text_length": 279974341,
+      "min_text_length": 8,
+      "average_text_length": 4254.799869304884,
+      "max_text_length": 188424,
+      "unique_texts": 65802
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 50554,
+      "min_text_length": 16,
+      "average_text_length": 50.70611835506519,
+      "max_text_length": 98,
+      "unique_texts": 997
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 997,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.0,
+      "max_relevant_docs_per_query": 1,
+      "unique_relevant_docs": 989
+    },
+    "top_ranked_statistics": null
+  }
+}
mteb/descriptive_stats/Retrieval/MIRACLJaRetrievalLite.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 105924,
+    "number_of_characters": 20818958,
+    "documents_text_statistics": {
+      "total_text_length": 20803724,
+      "min_text_length": 4,
+      "average_text_length": 198.01001294449097,
+      "max_text_length": 13231,
+      "unique_texts": 104988
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 15234,
+      "min_text_length": 7,
+      "average_text_length": 17.71395348837209,
+      "max_text_length": 48,
+      "unique_texts": 860
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 1790,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 2.0813953488372094,
+      "max_relevant_docs_per_query": 11,
+      "unique_relevant_docs": 1728
+    },
+    "top_ranked_statistics": null
+  }
+}
mteb/descriptive_stats/Retrieval/MrTyDiJaRetrievalLite.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "test": {
+    "num_samples": 94102,
+    "number_of_characters": 17949014,
+    "documents_text_statistics": {
+      "total_text_length": 17935995,
+      "min_text_length": 4,
+      "average_text_length": 192.07122357627807,
+      "max_text_length": 10778,
+      "unique_texts": 93122
+    },
+    "documents_image_statistics": null,
+    "queries_text_statistics": {
+      "total_text_length": 13019,
+      "min_text_length": 6,
+      "average_text_length": 18.081944444444446,
+      "max_text_length": 44,
+      "unique_texts": 720
+    },
+    "queries_image_statistics": null,
+    "relevant_docs_statistics": {
+      "num_relevant_docs": 923,
+      "min_relevant_docs_per_query": 1,
+      "average_relevant_docs_per_query": 1.2819444444444446,
+      "max_relevant_docs_per_query": 3,
+      "unique_relevant_docs": 880
+    },
+    "top_ranked_statistics": null
+  }
+}
mteb/evaluate.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import logging
 from collections.abc import Iterable
-from copy import deepcopy
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, cast
@@ -53,36 +52,6 @@ class OverwriteStrategy(HelpfulStrEnum):
     ONLY_CACHE = "only-cache"
 
 
-_empty_model_meta = ModelMeta(
-    loader=None,
-    name=None,
-    revision=None,
-    release_date=None,
-    languages=None,
-    framework=[],
-    similarity_fn_name=None,
-    n_parameters=None,
-    memory_usage_mb=None,
-    max_tokens=None,
-    embed_dim=None,
-    license=None,
-    open_weights=None,
-    public_training_code=None,
-    public_training_data=None,
-    use_instructions=None,
-    training_datasets=None,
-    modalities=[],
-)
-
-
-def _create_empty_model_meta() -> ModelMeta:
-    logger.warning("Model metadata is missing. Using empty metadata.")
-    meta = deepcopy(_empty_model_meta)
-    meta.revision = "no_revision_available"
-    meta.name = "no_model_name_available"
-    return meta
-
-
 def _sanitize_model(
     model: ModelMeta | MTEBModels | SentenceTransformer | CrossEncoder,
 ) -> tuple[MTEBModels | ModelMeta, ModelMeta, ModelName, Revision]:
@@ -101,9 +70,9 @@ def _sanitize_model(
     elif hasattr(model, "mteb_model_meta"):
         meta = model.mteb_model_meta  # type: ignore[attr-defined]
         if not isinstance(meta, ModelMeta):
-            meta = _create_empty_model_meta()
+            meta = ModelMeta.from_hub(None)
     else:
-        meta = _create_empty_model_meta() if not isinstance(model, ModelMeta) else model
+        meta = ModelMeta.from_hub(None) if not isinstance(model, ModelMeta) else model
 
     model_name = cast(str, meta.name)
     model_revision = cast(str, meta.revision)
mteb/leaderboard/figures.py
CHANGED
@@ -117,7 +117,7 @@ def _performance_size_plot(df: pd.DataFrame) -> go.Figure:
     df["Max Tokens"] = df["Max Tokens"].map(_parse_float)
     df["Log(Tokens)"] = np.log10(df["Max Tokens"])
     df["Mean (Task)"] = df["Mean (Task)"].map(_parse_float)
-    df = df
+    df = df.dropna(
         subset=["Mean (Task)", "Number of Parameters", "Embedding Dimensions"]
     )
     if not len(df.index):
mteb/leaderboard/table.py
CHANGED
@@ -26,16 +26,6 @@ def _format_scores(score: float) -> float:
     return round(score * 100, 2)
 
 
-def _get_column_types(df: pd.DataFrame) -> list[str]:
-    types = []
-    for column_name in df.columns:
-        if is_numeric_dtype(df[column_name]):
-            types.append("number")
-        else:
-            types.append("str")
-    return types
-
-
 def _get_column_widths(df: pd.DataFrame) -> list[str]:
     # Please do not remove this function when refactoring.
     # Column width calculation seeminlgy changes regularly with Gradio releases,
@@ -226,7 +216,7 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame:
         gmap=gmap_values.loc[mask],
     )
 
-    column_types =
+    column_types = ["auto" for _ in joint_table_style.data.columns]
     # setting model name column to markdown
     if len(column_types) > 1:
         column_types[1] = "markdown"
mteb/models/abs_encoder.py
CHANGED
@@ -54,11 +54,11 @@ class AbsEncoder(ABC):
         """A wrapper function around the model.encode method that handles the prompt_name argument and standardizes the output to a numpy array.
 
         The order of priorities for prompt selection are:
-            1. Composed prompt of task name + prompt type
+            1. Composed prompt of task name + prompt type
             2. Specific task prompt
-            3. Composed prompt of task type + prompt type
+            3. Composed prompt of task type + prompt type
             4. Specific task type prompt
-            5. Specific prompt type
+            5. Specific prompt type
 
         Args:
             task_metadata: The task name to use for building the encoding prompt
@@ -105,7 +105,7 @@ class AbsEncoder(ABC):
 
         Args:
             task_metadata: The metadata of the task.
-            prompt_type: The name type of prompt.
+            prompt_type: The name type of prompt.
         """
         if not self.model_prompts:
             return None
@@ -210,13 +210,11 @@
             task_metadata: The metadata of the task. Sentence-transformers uses this to
                 determine which prompt to use from a specified dictionary.
                 The order of priorities for prompt selection are:
-                1. Composed prompt of task name + prompt type
-                2. Specific task prompt
-                3. Composed prompt of task type + prompt type
-                4. Specific task type prompt
-                5. Specific prompt type
-                6. Default prompt from the task definition
-            prompt_type: The name type of prompt. (query or passage)
+                1. Specific task prompt
+                2. Specific task type prompt
+                3. Specific prompt type
+                4. Default prompt from the task definition
+            prompt_type: The name type of prompt.
 
         Returns:
             The instruction/prompt to be used for encoding sentences.
@@ -224,6 +222,12 @@
         prompt = task_metadata.prompt
         if self.prompts_dict and task_metadata.name in self.prompts_dict:
             prompt = self.prompts_dict[task_metadata.name]
+        elif self.prompts_dict and task_metadata.type in self.prompts_dict:
+            prompt = self.prompts_dict[task_metadata.type]
+        elif (
+            self.prompts_dict and prompt_type and prompt_type.value in self.prompts_dict
+        ):
+            prompt = self.prompts_dict[prompt_type.value]
 
         if isinstance(prompt, dict) and prompt_type:
             if prompt.get(prompt_type.value):
@@ -246,7 +250,7 @@
 
         Args:
             instruction: The instruction to be formatted.
-            prompt_type: The name type of prompt.
+            prompt_type: The name type of prompt.
         """
         if self.instruction_template is None:
            raise ValueError(
@@ -269,7 +273,7 @@
 
         Args:
             task_metadata: The metadata of the task
-            prompt_type: The name type of prompt.
+            prompt_type: The name type of prompt.
 
         Returns:
             The instruction to be used for encoding sentences.
@@ -373,14 +377,14 @@
             task_metadata: The metadata of the task. Sentence-transformers uses this to
                 determine which prompt to use from a specified dictionary.
                 The order of priorities for prompt selection are:
-                1. Composed prompt of task name + prompt type
+                1. Composed prompt of task name + prompt type
                 2. Specific task prompt
-                3. Composed prompt of task type + prompt type
+                3. Composed prompt of task type + prompt type
                 4. Specific task type prompt
-                5. Specific prompt type
+                5. Specific prompt type
             hf_split: Split of current task
             hf_subset: Subset of current task
-            prompt_type: The name type of prompt.
+            prompt_type: The name type of prompt.
             **kwargs: Additional arguments to pass to the encoder.
 
         Returns:
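The added `elif` branches extend the `prompts_dict` fallback: an exact task-name key wins, then the task type, then the bare prompt type ("query" or "passage"). A standalone illustration of that lookup order with plain strings — not the library's code, and the example keys and values are made up:

    def resolve_prompt(
        prompts: dict[str, str],
        task_name: str,
        task_type: str,
        prompt_type: str | None,
    ) -> str | None:
        # Mirror of the priority added above: task name > task type > prompt type.
        if task_name in prompts:
            return prompts[task_name]
        if task_type in prompts:
            return prompts[task_type]
        if prompt_type and prompt_type in prompts:
            return prompts[prompt_type]
        return None

    prompts = {"Retrieval": "Represent the document for retrieval:", "query": "Represent the query:"}
    print(resolve_prompt(prompts, "JaqketRetrievalLite", "Retrieval", "query"))  # matches the task-type key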
mteb/models/cache_wrappers/cache_backends/_hash_utils.py
CHANGED
@@ -1,7 +1,5 @@
 import hashlib
 
-from PIL import Image
-
 from mteb.types import BatchedInput
 
 
@@ -11,6 +9,8 @@ def _hash_item(item: BatchedInput) -> str:
         item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
 
     if "image" in item:
+        from PIL import Image
+
         image: Image.Image = item["image"]
         item_hash += hashlib.sha256(image.tobytes()).hexdigest()
 
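Moving the PIL import inside the image branch defers the Pillow dependency until an image is actually hashed, so text-only caching works without Pillow installed. A small sketch of the same scheme on a plain dict; the item shape is only assumed from the code above:

    import hashlib

    def hash_item(item: dict) -> str:
        # sha256 of the text, with the image hash appended only when an image is present.
        item_hash = ""
        if "text" in item:
            item_hash = hashlib.sha256(item["text"].encode()).hexdigest()
        if "image" in item:
            from PIL import Image  # deferred: only needed when images are hashed

            image: Image.Image = item["image"]
            item_hash += hashlib.sha256(image.tobytes()).hexdigest()
        return item_hash

    print(hash_item({"text": "ドキュメントの例"}))  # runs without Pillow installed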
|