mteb 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/abstasks/classification.py +2 -3
- mteb/abstasks/multilabel_classification.py +3 -3
- mteb/abstasks/regression.py +1 -1
- mteb/abstasks/retrieval.py +1 -1
- mteb/abstasks/task_metadata.py +9 -14
- mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
- mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
- mteb/models/model_implementations/align_models.py +1 -1
- mteb/models/model_implementations/andersborges.py +2 -2
- mteb/models/model_implementations/ara_models.py +1 -1
- mteb/models/model_implementations/arctic_models.py +8 -8
- mteb/models/model_implementations/b1ade_models.py +1 -1
- mteb/models/model_implementations/bge_models.py +45 -21
- mteb/models/model_implementations/bica_model.py +3 -3
- mteb/models/model_implementations/blip2_models.py +2 -2
- mteb/models/model_implementations/blip_models.py +8 -8
- mteb/models/model_implementations/bmretriever_models.py +4 -4
- mteb/models/model_implementations/cadet_models.py +1 -1
- mteb/models/model_implementations/cde_models.py +2 -2
- mteb/models/model_implementations/clip_models.py +3 -3
- mteb/models/model_implementations/clips_models.py +3 -3
- mteb/models/model_implementations/codefuse_models.py +5 -5
- mteb/models/model_implementations/codesage_models.py +3 -3
- mteb/models/model_implementations/cohere_models.py +4 -4
- mteb/models/model_implementations/colpali_models.py +3 -3
- mteb/models/model_implementations/colqwen_models.py +8 -8
- mteb/models/model_implementations/colsmol_models.py +2 -2
- mteb/models/model_implementations/conan_models.py +1 -1
- mteb/models/model_implementations/dino_models.py +19 -19
- mteb/models/model_implementations/e5_instruct.py +23 -4
- mteb/models/model_implementations/e5_models.py +9 -9
- mteb/models/model_implementations/e5_v.py +1 -1
- mteb/models/model_implementations/eagerworks_models.py +1 -1
- mteb/models/model_implementations/emillykkejensen_models.py +3 -3
- mteb/models/model_implementations/en_code_retriever.py +1 -1
- mteb/models/model_implementations/euler_models.py +2 -2
- mteb/models/model_implementations/fa_models.py +9 -9
- mteb/models/model_implementations/facebookai.py +14 -2
- mteb/models/model_implementations/geogpt_models.py +1 -1
- mteb/models/model_implementations/gme_v_models.py +2 -2
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
- mteb/models/model_implementations/gritlm_models.py +2 -2
- mteb/models/model_implementations/gte_models.py +25 -13
- mteb/models/model_implementations/hinvec_models.py +1 -1
- mteb/models/model_implementations/ibm_granite_models.py +30 -6
- mteb/models/model_implementations/inf_models.py +2 -2
- mteb/models/model_implementations/jasper_models.py +2 -2
- mteb/models/model_implementations/jina_clip.py +1 -1
- mteb/models/model_implementations/jina_models.py +11 -5
- mteb/models/model_implementations/kblab.py +12 -6
- mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
- mteb/models/model_implementations/kfst.py +1 -1
- mteb/models/model_implementations/kowshik24_models.py +1 -1
- mteb/models/model_implementations/lgai_embedding_models.py +1 -1
- mteb/models/model_implementations/linq_models.py +1 -1
- mteb/models/model_implementations/listconranker.py +1 -1
- mteb/models/model_implementations/llm2clip_models.py +3 -3
- mteb/models/model_implementations/llm2vec_models.py +8 -8
- mteb/models/model_implementations/mdbr_models.py +14 -2
- mteb/models/model_implementations/misc_models.py +68 -68
- mteb/models/model_implementations/mme5_models.py +1 -1
- mteb/models/model_implementations/moco_models.py +2 -2
- mteb/models/model_implementations/mod_models.py +1 -1
- mteb/models/model_implementations/model2vec_models.py +13 -13
- mteb/models/model_implementations/moka_models.py +1 -1
- mteb/models/model_implementations/mxbai_models.py +16 -3
- mteb/models/model_implementations/nbailab.py +3 -3
- mteb/models/model_implementations/no_instruct_sentence_models.py +1 -1
- mteb/models/model_implementations/nomic_models.py +18 -6
- mteb/models/model_implementations/nomic_models_vision.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -2
- mteb/models/model_implementations/nvidia_models.py +3 -3
- mteb/models/model_implementations/octen_models.py +3 -3
- mteb/models/model_implementations/openclip_models.py +6 -6
- mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
- mteb/models/model_implementations/ops_moa_models.py +1 -1
- mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
- mteb/models/model_implementations/pawan_models.py +1 -1
- mteb/models/model_implementations/piccolo_models.py +1 -1
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/pylate_models.py +5 -5
- mteb/models/model_implementations/qodo_models.py +2 -2
- mteb/models/model_implementations/qtack_models.py +1 -1
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +2 -2
- mteb/models/model_implementations/rasgaard_models.py +1 -1
- mteb/models/model_implementations/reasonir_model.py +1 -1
- mteb/models/model_implementations/repllama_models.py +1 -1
- mteb/models/model_implementations/rerankers_custom.py +9 -3
- mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
- mteb/models/model_implementations/richinfoai_models.py +1 -1
- mteb/models/model_implementations/ru_sentence_models.py +20 -20
- mteb/models/model_implementations/ruri_models.py +10 -10
- mteb/models/model_implementations/salesforce_models.py +3 -3
- mteb/models/model_implementations/samilpwc_models.py +1 -1
- mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
- mteb/models/model_implementations/searchmap_models.py +1 -1
- mteb/models/model_implementations/sentence_transformers_models.py +58 -22
- mteb/models/model_implementations/shuu_model.py +1 -1
- mteb/models/model_implementations/siglip_models.py +10 -10
- mteb/models/model_implementations/slm_models.py +416 -0
- mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
- mteb/models/model_implementations/stella_models.py +17 -4
- mteb/models/model_implementations/tarka_models.py +2 -2
- mteb/models/model_implementations/text2vec_models.py +9 -3
- mteb/models/model_implementations/ua_sentence_models.py +1 -1
- mteb/models/model_implementations/uae_models.py +7 -1
- mteb/models/model_implementations/vdr_models.py +1 -1
- mteb/models/model_implementations/vi_vn_models.py +6 -6
- mteb/models/model_implementations/vlm2vec_models.py +2 -2
- mteb/models/model_implementations/youtu_models.py +1 -1
- mteb/models/model_implementations/yuan_models.py +1 -1
- mteb/models/model_implementations/yuan_models_en.py +1 -1
- mteb/models/model_meta.py +46 -17
- mteb/results/benchmark_results.py +2 -2
- mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
- mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
- mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
- mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
- mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
- mteb/tasks/retrieval/vie/__init__.py +14 -6
- mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
- mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
- mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
- mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/METADATA +3 -3
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/RECORD +142 -133
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/WHEEL +0 -0
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/entry_points.txt +0 -0
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/top_level.txt +0 -0
mteb/abstasks/classification.py
CHANGED
|
@@ -98,9 +98,8 @@ class AbsTaskClassification(AbsTask):
|
|
|
98
98
|
text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
|
|
99
99
|
label: int. Column name can be changed via `label_column_name` attribute.
|
|
100
100
|
evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
n_experiments: Number of experiments to run. Default is 10.
|
|
101
|
+
samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
|
|
102
|
+
n_experiments: Number of experiments to run. Default is 10.
|
|
104
103
|
train_split: Name of the split to use for training the evaluator model. Default is "train".
|
|
105
104
|
label_column_name: Name of the column containing the labels. Default is "label".
|
|
106
105
|
input_column_name: Name of the column containing the input data. Default is "text".
|
|
@@ -70,10 +70,10 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
70
70
|
input_column_name: Name of the column containing the input text.
|
|
71
71
|
label_column_name: Name of the column containing the labels.
|
|
72
72
|
samples_per_label: Number of samples to use per label. These samples are embedded and a classifier is fit using the labels and samples.
|
|
73
|
-
|
|
73
|
+
evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
|
|
74
74
|
"""
|
|
75
75
|
|
|
76
|
-
|
|
76
|
+
evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
|
|
77
77
|
input_column_name: str = "text"
|
|
78
78
|
label_column_name: str = "label"
|
|
79
79
|
|
|
@@ -169,7 +169,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
|
|
|
169
169
|
y_train = train_split.select(sample_indices)[self.label_column_name]
|
|
170
170
|
y_train = binarizer.transform(y_train)
|
|
171
171
|
y_pred, current_classifier = _evaluate_classifier(
|
|
172
|
-
X_train, y_train, X_test, self.
|
|
172
|
+
X_train, y_train, X_test, self.evaluator_model
|
|
173
173
|
)
|
|
174
174
|
if prediction_folder:
|
|
175
175
|
all_predictions.append(y_pred.tolist())
|
mteb/abstasks/regression.py
CHANGED
|
@@ -84,7 +84,7 @@ class AbsTaskRegression(AbsTaskClassification):
|
|
|
84
84
|
n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
|
|
85
85
|
abstask_prompt: Prompt to use for the task for instruction models if no prompt is provided in TaskMetadata.prompt.
|
|
86
86
|
evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
|
|
87
|
-
|
|
87
|
+
|
|
88
88
|
"""
|
|
89
89
|
|
|
90
90
|
evaluator: type[SklearnEvaluator] = SklearnEvaluator
|
mteb/abstasks/retrieval.py
CHANGED
mteb/abstasks/task_metadata.py
CHANGED
|
@@ -485,7 +485,6 @@ class TaskMetadata(BaseModel):
|
|
|
485
485
|
dataset_type = [
|
|
486
486
|
*self._hf_task_type(),
|
|
487
487
|
*self._hf_task_category(),
|
|
488
|
-
*self._hf_subtypes(),
|
|
489
488
|
]
|
|
490
489
|
languages = self._hf_languages()
|
|
491
490
|
|
|
@@ -587,10 +586,8 @@ class TaskMetadata(BaseModel):
|
|
|
587
586
|
|
|
588
587
|
def _hf_subtypes(self) -> list[str]:
|
|
589
588
|
# to get full list of available task_ids execute
|
|
590
|
-
#
|
|
591
|
-
#
|
|
592
|
-
# "repoType": "dataset"
|
|
593
|
-
# })
|
|
589
|
+
# https://huggingface.co/api/datasets-tags-by-type?type=task_ids
|
|
590
|
+
# ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
|
|
594
591
|
mteb_to_hf_subtype = {
|
|
595
592
|
"Article retrieval": ["document-retrieval"],
|
|
596
593
|
"Conversational retrieval": ["conversational", "utterance-retrieval"],
|
|
@@ -612,7 +609,7 @@ class TaskMetadata(BaseModel):
|
|
|
612
609
|
"hate-speech-detection",
|
|
613
610
|
],
|
|
614
611
|
"Thematic clustering": [],
|
|
615
|
-
"Scientific Reranking": [],
|
|
612
|
+
"Scientific Reranking": ["text-scoring"],
|
|
616
613
|
"Claim verification": ["fact-checking", "fact-checking-retrieval"],
|
|
617
614
|
"Topic classification": ["topic-classification"],
|
|
618
615
|
"Code retrieval": [],
|
|
@@ -620,21 +617,21 @@ class TaskMetadata(BaseModel):
|
|
|
620
617
|
"Cross-Lingual Semantic Discrimination": [],
|
|
621
618
|
"Textual Entailment": ["natural-language-inference"],
|
|
622
619
|
"Counterfactual Detection": [],
|
|
623
|
-
"Emotion classification": [],
|
|
620
|
+
"Emotion classification": ["sentiment-classification"],
|
|
624
621
|
"Reasoning as Retrieval": [],
|
|
625
622
|
"Rendered Texts Understanding": [],
|
|
626
623
|
"Image Text Retrieval": [],
|
|
627
624
|
"Object recognition": [],
|
|
628
625
|
"Scene recognition": [],
|
|
629
626
|
"Caption Pairing": ["image-captioning"],
|
|
630
|
-
"Emotion recognition": [],
|
|
627
|
+
"Emotion recognition": ["sentiment-scoring"],
|
|
631
628
|
"Textures recognition": [],
|
|
632
629
|
"Activity recognition": [],
|
|
633
630
|
"Tumor detection": [],
|
|
634
631
|
"Duplicate Detection": [],
|
|
635
632
|
"Rendered semantic textual similarity": [
|
|
636
633
|
"semantic-similarity-scoring",
|
|
637
|
-
"
|
|
634
|
+
"semantic-similarity-classification",
|
|
638
635
|
],
|
|
639
636
|
"Intent classification": [
|
|
640
637
|
"intent-classification",
|
|
@@ -648,10 +645,8 @@ class TaskMetadata(BaseModel):
|
|
|
648
645
|
|
|
649
646
|
def _hf_task_type(self) -> list[str]:
|
|
650
647
|
# to get full list of task_types execute:
|
|
651
|
-
#
|
|
652
|
-
#
|
|
653
|
-
# }).json()
|
|
654
|
-
# or look at https://huggingface.co/tasks
|
|
648
|
+
# https://huggingface.co/api/datasets-tags-by-type?type=task_categories
|
|
649
|
+
# ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
|
|
655
650
|
mteb_task_type_to_datasets = {
|
|
656
651
|
# Text
|
|
657
652
|
"BitextMining": ["translation"],
|
|
@@ -670,7 +665,7 @@ class TaskMetadata(BaseModel):
|
|
|
670
665
|
"Any2AnyRetrieval": ["visual-document-retrieval"],
|
|
671
666
|
"Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
|
|
672
667
|
"VisionCentricQA": ["visual-question-answering"],
|
|
673
|
-
"ImageClustering": ["image-
|
|
668
|
+
"ImageClustering": ["image-feature-extraction"],
|
|
674
669
|
"ImageClassification": ["image-classification"],
|
|
675
670
|
"ImageMultilabelClassification": ["image-classification"],
|
|
676
671
|
"DocumentUnderstanding": ["visual-document-retrieval"],
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 102198,
|
|
4
|
+
"number_of_characters": 47870352,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 47719757,
|
|
7
|
+
"min_text_length": 9,
|
|
8
|
+
"average_text_length": 472.01951591046225,
|
|
9
|
+
"max_text_length": 8686,
|
|
10
|
+
"unique_texts": 101097
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 150595,
|
|
15
|
+
"min_text_length": 30,
|
|
16
|
+
"average_text_length": 136.78019981834694,
|
|
17
|
+
"max_text_length": 404,
|
|
18
|
+
"unique_texts": 1099
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 3401,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 3.089009990917348,
|
|
25
|
+
"max_relevant_docs_per_query": 5,
|
|
26
|
+
"unique_relevant_docs": 1123
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 132137,
|
|
4
|
+
"number_of_characters": 43323279,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 43311486,
|
|
7
|
+
"min_text_length": 11,
|
|
8
|
+
"average_text_length": 328.5778249819823,
|
|
9
|
+
"max_text_length": 8576,
|
|
10
|
+
"unique_texts": 131814
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 11793,
|
|
15
|
+
"min_text_length": 6,
|
|
16
|
+
"average_text_length": 36.62422360248447,
|
|
17
|
+
"max_text_length": 100,
|
|
18
|
+
"unique_texts": 321
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 11620,
|
|
23
|
+
"min_relevant_docs_per_query": 31,
|
|
24
|
+
"average_relevant_docs_per_query": 36.08695652173913,
|
|
25
|
+
"max_relevant_docs_per_query": 1288,
|
|
26
|
+
"unique_relevant_docs": 32537
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 106558,
|
|
4
|
+
"number_of_characters": 48164581,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 47886101,
|
|
7
|
+
"min_text_length": 9,
|
|
8
|
+
"average_text_length": 472.6783768310499,
|
|
9
|
+
"max_text_length": 8689,
|
|
10
|
+
"unique_texts": 101308
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 278480,
|
|
15
|
+
"min_text_length": 11,
|
|
16
|
+
"average_text_length": 53.04380952380952,
|
|
17
|
+
"max_text_length": 196,
|
|
18
|
+
"unique_texts": 5124
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 6254,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.1912380952380952,
|
|
25
|
+
"max_relevant_docs_per_query": 15,
|
|
26
|
+
"unique_relevant_docs": 1324
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 117974,
|
|
4
|
+
"number_of_characters": 35927363,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 35335613,
|
|
7
|
+
"min_text_length": 22,
|
|
8
|
+
"average_text_length": 316.47705838625023,
|
|
9
|
+
"max_text_length": 4105,
|
|
10
|
+
"unique_texts": 111651
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 591750,
|
|
15
|
+
"min_text_length": 21,
|
|
16
|
+
"average_text_length": 93.61651637399146,
|
|
17
|
+
"max_text_length": 280,
|
|
18
|
+
"unique_texts": 6321
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 12642,
|
|
23
|
+
"min_relevant_docs_per_query": 2,
|
|
24
|
+
"average_relevant_docs_per_query": 2.0,
|
|
25
|
+
"max_relevant_docs_per_query": 2,
|
|
26
|
+
"unique_relevant_docs": 11874
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"dev": {
|
|
3
|
+
"num_samples": 107153,
|
|
4
|
+
"number_of_characters": 33316879,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 33200903,
|
|
7
|
+
"min_text_length": 2,
|
|
8
|
+
"average_text_length": 320.30199218561575,
|
|
9
|
+
"max_text_length": 1712,
|
|
10
|
+
"unique_texts": 103641
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 115976,
|
|
15
|
+
"min_text_length": 8,
|
|
16
|
+
"average_text_length": 33.15494568324757,
|
|
17
|
+
"max_text_length": 190,
|
|
18
|
+
"unique_texts": 3498
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 3700,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.0577472841623785,
|
|
25
|
+
"max_relevant_docs_per_query": 4,
|
|
26
|
+
"unique_relevant_docs": 3698
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 104095,
|
|
4
|
+
"number_of_characters": 52312680,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 52220289,
|
|
7
|
+
"min_text_length": 10,
|
|
8
|
+
"average_text_length": 510.98673124908265,
|
|
9
|
+
"max_text_length": 10245,
|
|
10
|
+
"unique_texts": 102181
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 92391,
|
|
15
|
+
"min_text_length": 22,
|
|
16
|
+
"average_text_length": 48.62684210526316,
|
|
17
|
+
"max_text_length": 113,
|
|
18
|
+
"unique_texts": 1900
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 2283,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.201578947368421,
|
|
25
|
+
"max_relevant_docs_per_query": 4,
|
|
26
|
+
"unique_relevant_docs": 2283
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"test": {
|
|
3
|
+
"num_samples": 20561,
|
|
4
|
+
"number_of_characters": 10832770,
|
|
5
|
+
"documents_text_statistics": {
|
|
6
|
+
"total_text_length": 9929303,
|
|
7
|
+
"min_text_length": 9,
|
|
8
|
+
"average_text_length": 938.8524016641452,
|
|
9
|
+
"max_text_length": 6319,
|
|
10
|
+
"unique_texts": 10573
|
|
11
|
+
},
|
|
12
|
+
"documents_image_statistics": null,
|
|
13
|
+
"queries_text_statistics": {
|
|
14
|
+
"total_text_length": 903467,
|
|
15
|
+
"min_text_length": 13,
|
|
16
|
+
"average_text_length": 90.48242363545317,
|
|
17
|
+
"max_text_length": 228,
|
|
18
|
+
"unique_texts": 9985
|
|
19
|
+
},
|
|
20
|
+
"queries_image_statistics": null,
|
|
21
|
+
"relevant_docs_statistics": {
|
|
22
|
+
"num_relevant_docs": 11158,
|
|
23
|
+
"min_relevant_docs_per_query": 1,
|
|
24
|
+
"average_relevant_docs_per_query": 1.1174762143214823,
|
|
25
|
+
"max_relevant_docs_per_query": 8,
|
|
26
|
+
"unique_relevant_docs": 10576
|
|
27
|
+
},
|
|
28
|
+
"top_ranked_statistics": null
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -118,7 +118,7 @@ align_base = ModelMeta(
|
|
|
118
118
|
open_weights=True,
|
|
119
119
|
public_training_code="https://github.com/kakaobrain/coyo-align",
|
|
120
120
|
public_training_data=True,
|
|
121
|
-
framework=["PyTorch"],
|
|
121
|
+
framework=["PyTorch", "Transformers"],
|
|
122
122
|
reference="https://huggingface.co/kakaobrain/align-base",
|
|
123
123
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
124
124
|
use_instructions=False,
|
|
@@ -17,7 +17,7 @@ model2vecdk = ModelMeta(
|
|
|
17
17
|
embed_dim=256,
|
|
18
18
|
license="mit",
|
|
19
19
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
20
|
-
framework=["NumPy", "Sentence Transformers"],
|
|
20
|
+
framework=["NumPy", "Sentence Transformers", "safetensors"],
|
|
21
21
|
reference="https://huggingface.co/andersborges/model2vecdk",
|
|
22
22
|
use_instructions=False,
|
|
23
23
|
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
@@ -48,7 +48,7 @@ model2vecdk_stem = ModelMeta(
|
|
|
48
48
|
embed_dim=256,
|
|
49
49
|
license="mit",
|
|
50
50
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
51
|
-
framework=["NumPy", "Sentence Transformers"],
|
|
51
|
+
framework=["NumPy", "Sentence Transformers", "safetensors"],
|
|
52
52
|
reference="https://huggingface.co/andersborges/model2vecdk",
|
|
53
53
|
use_instructions=False,
|
|
54
54
|
adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
|
|
@@ -16,7 +16,7 @@ arabic_triplet_matryoshka = ModelMeta(
|
|
|
16
16
|
max_tokens=768,
|
|
17
17
|
reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
|
|
18
18
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
19
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
19
|
+
framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
|
|
20
20
|
use_instructions=False,
|
|
21
21
|
public_training_code=None,
|
|
22
22
|
adapted_from="aubmindlab/bert-base-arabertv02",
|
|
@@ -145,7 +145,7 @@ arctic_embed_xs = ModelMeta(
|
|
|
145
145
|
release_date="2024-07-08", # initial commit of hf model.
|
|
146
146
|
languages=["eng-Latn"],
|
|
147
147
|
open_weights=True,
|
|
148
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
148
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
149
149
|
n_parameters=22_600_000,
|
|
150
150
|
memory_usage_mb=86,
|
|
151
151
|
max_tokens=512,
|
|
@@ -171,7 +171,7 @@ arctic_embed_s = ModelMeta(
|
|
|
171
171
|
release_date="2024-04-12", # initial commit of hf model.
|
|
172
172
|
languages=["eng-Latn"],
|
|
173
173
|
open_weights=True,
|
|
174
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
174
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
175
175
|
n_parameters=32_200_000,
|
|
176
176
|
memory_usage_mb=127,
|
|
177
177
|
max_tokens=512,
|
|
@@ -197,7 +197,7 @@ arctic_embed_m = ModelMeta(
|
|
|
197
197
|
release_date="2024-04-12", # initial commit of hf model.
|
|
198
198
|
languages=["eng-Latn"],
|
|
199
199
|
open_weights=True,
|
|
200
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
200
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
201
201
|
n_parameters=109_000_000,
|
|
202
202
|
memory_usage_mb=415,
|
|
203
203
|
max_tokens=512,
|
|
@@ -223,7 +223,7 @@ arctic_embed_m_long = ModelMeta(
|
|
|
223
223
|
release_date="2024-04-12", # initial commit of hf model.
|
|
224
224
|
languages=["eng-Latn"],
|
|
225
225
|
open_weights=True,
|
|
226
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
226
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
227
227
|
n_parameters=137_000_000,
|
|
228
228
|
memory_usage_mb=522,
|
|
229
229
|
max_tokens=2048,
|
|
@@ -248,7 +248,7 @@ arctic_embed_l = ModelMeta(
|
|
|
248
248
|
release_date="2024-04-12", # initial commit of hf model.
|
|
249
249
|
languages=["eng-Latn"],
|
|
250
250
|
open_weights=True,
|
|
251
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
251
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
252
252
|
n_parameters=335_000_000,
|
|
253
253
|
memory_usage_mb=1274,
|
|
254
254
|
max_tokens=512,
|
|
@@ -278,7 +278,7 @@ arctic_embed_m_v1_5 = ModelMeta(
|
|
|
278
278
|
release_date="2024-07-08", # initial commit of hf model.
|
|
279
279
|
languages=["eng-Latn"],
|
|
280
280
|
open_weights=True,
|
|
281
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
281
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors", "GGUF"],
|
|
282
282
|
n_parameters=109_000_000,
|
|
283
283
|
memory_usage_mb=415,
|
|
284
284
|
max_tokens=512,
|
|
@@ -304,7 +304,7 @@ arctic_embed_m_v2_0 = ModelMeta(
|
|
|
304
304
|
release_date="2024-12-04", # initial commit of hf model.
|
|
305
305
|
languages=LANGUAGES_V2_0,
|
|
306
306
|
open_weights=True,
|
|
307
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
307
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
308
308
|
n_parameters=305_000_000,
|
|
309
309
|
memory_usage_mb=1165,
|
|
310
310
|
max_tokens=8192,
|
|
@@ -329,7 +329,7 @@ arctic_embed_l_v2_0 = ModelMeta(
|
|
|
329
329
|
release_date="2024-12-04", # initial commit of hf model.
|
|
330
330
|
languages=LANGUAGES_V2_0,
|
|
331
331
|
open_weights=True,
|
|
332
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
332
|
+
framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
|
|
333
333
|
n_parameters=568_000_000,
|
|
334
334
|
memory_usage_mb=2166,
|
|
335
335
|
max_tokens=8192,
|
|
@@ -22,7 +22,7 @@ b1ade_embed = ModelMeta(
|
|
|
22
22
|
max_tokens=4096,
|
|
23
23
|
reference="https://huggingface.co/w601sxs/b1ade-embed",
|
|
24
24
|
similarity_fn_name=ScoringFunction.COSINE,
|
|
25
|
-
framework=["Sentence Transformers", "PyTorch"],
|
|
25
|
+
framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
|
|
26
26
|
use_instructions=False,
|
|
27
27
|
public_training_code=None,
|
|
28
28
|
public_training_data=None,
|