mteb 2.6.4__py3-none-any.whl → 2.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. mteb/abstasks/classification.py +2 -3
  2. mteb/abstasks/multilabel_classification.py +3 -3
  3. mteb/abstasks/regression.py +1 -1
  4. mteb/abstasks/retrieval.py +1 -1
  5. mteb/abstasks/task_metadata.py +9 -14
  6. mteb/descriptive_stats/Retrieval/NanoClimateFEVER-VN.json +30 -0
  7. mteb/descriptive_stats/Retrieval/NanoDBPedia-VN.json +30 -0
  8. mteb/descriptive_stats/Retrieval/NanoFEVER-VN.json +30 -0
  9. mteb/descriptive_stats/Retrieval/NanoHotpotQA-VN.json +30 -0
  10. mteb/descriptive_stats/Retrieval/NanoMSMARCO-VN.json +30 -0
  11. mteb/descriptive_stats/Retrieval/NanoNQ-VN.json +30 -0
  12. mteb/descriptive_stats/Retrieval/TVPLRetrieval.json +30 -0
  13. mteb/models/model_implementations/align_models.py +1 -1
  14. mteb/models/model_implementations/andersborges.py +2 -2
  15. mteb/models/model_implementations/ara_models.py +1 -1
  16. mteb/models/model_implementations/arctic_models.py +8 -8
  17. mteb/models/model_implementations/b1ade_models.py +1 -1
  18. mteb/models/model_implementations/bge_models.py +45 -21
  19. mteb/models/model_implementations/bica_model.py +3 -3
  20. mteb/models/model_implementations/blip2_models.py +2 -2
  21. mteb/models/model_implementations/blip_models.py +8 -8
  22. mteb/models/model_implementations/bmretriever_models.py +4 -4
  23. mteb/models/model_implementations/cadet_models.py +1 -1
  24. mteb/models/model_implementations/cde_models.py +2 -2
  25. mteb/models/model_implementations/clip_models.py +3 -3
  26. mteb/models/model_implementations/clips_models.py +3 -3
  27. mteb/models/model_implementations/codefuse_models.py +5 -5
  28. mteb/models/model_implementations/codesage_models.py +3 -3
  29. mteb/models/model_implementations/cohere_models.py +4 -4
  30. mteb/models/model_implementations/colpali_models.py +3 -3
  31. mteb/models/model_implementations/colqwen_models.py +8 -8
  32. mteb/models/model_implementations/colsmol_models.py +2 -2
  33. mteb/models/model_implementations/conan_models.py +1 -1
  34. mteb/models/model_implementations/dino_models.py +19 -19
  35. mteb/models/model_implementations/e5_instruct.py +23 -4
  36. mteb/models/model_implementations/e5_models.py +9 -9
  37. mteb/models/model_implementations/e5_v.py +1 -1
  38. mteb/models/model_implementations/eagerworks_models.py +1 -1
  39. mteb/models/model_implementations/emillykkejensen_models.py +3 -3
  40. mteb/models/model_implementations/en_code_retriever.py +1 -1
  41. mteb/models/model_implementations/euler_models.py +2 -2
  42. mteb/models/model_implementations/fa_models.py +9 -9
  43. mteb/models/model_implementations/facebookai.py +14 -2
  44. mteb/models/model_implementations/geogpt_models.py +1 -1
  45. mteb/models/model_implementations/gme_v_models.py +2 -2
  46. mteb/models/model_implementations/google_models.py +1 -1
  47. mteb/models/model_implementations/granite_vision_embedding_models.py +1 -1
  48. mteb/models/model_implementations/gritlm_models.py +2 -2
  49. mteb/models/model_implementations/gte_models.py +25 -13
  50. mteb/models/model_implementations/hinvec_models.py +1 -1
  51. mteb/models/model_implementations/ibm_granite_models.py +30 -6
  52. mteb/models/model_implementations/inf_models.py +2 -2
  53. mteb/models/model_implementations/jasper_models.py +2 -2
  54. mteb/models/model_implementations/jina_clip.py +1 -1
  55. mteb/models/model_implementations/jina_models.py +11 -5
  56. mteb/models/model_implementations/kblab.py +12 -6
  57. mteb/models/model_implementations/kennethenevoldsen_models.py +2 -2
  58. mteb/models/model_implementations/kfst.py +1 -1
  59. mteb/models/model_implementations/kowshik24_models.py +1 -1
  60. mteb/models/model_implementations/lgai_embedding_models.py +1 -1
  61. mteb/models/model_implementations/linq_models.py +1 -1
  62. mteb/models/model_implementations/listconranker.py +1 -1
  63. mteb/models/model_implementations/llm2clip_models.py +3 -3
  64. mteb/models/model_implementations/llm2vec_models.py +8 -8
  65. mteb/models/model_implementations/mdbr_models.py +14 -2
  66. mteb/models/model_implementations/misc_models.py +68 -68
  67. mteb/models/model_implementations/mme5_models.py +1 -1
  68. mteb/models/model_implementations/moco_models.py +2 -2
  69. mteb/models/model_implementations/mod_models.py +1 -1
  70. mteb/models/model_implementations/model2vec_models.py +13 -13
  71. mteb/models/model_implementations/moka_models.py +1 -1
  72. mteb/models/model_implementations/mxbai_models.py +16 -3
  73. mteb/models/model_implementations/nbailab.py +3 -3
  74. mteb/models/model_implementations/no_instruct_sentence_models.py +1 -1
  75. mteb/models/model_implementations/nomic_models.py +18 -6
  76. mteb/models/model_implementations/nomic_models_vision.py +1 -1
  77. mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +2 -2
  78. mteb/models/model_implementations/nvidia_models.py +3 -3
  79. mteb/models/model_implementations/octen_models.py +3 -3
  80. mteb/models/model_implementations/openclip_models.py +6 -6
  81. mteb/models/model_implementations/opensearch_neural_sparse_models.py +5 -5
  82. mteb/models/model_implementations/ops_moa_models.py +1 -1
  83. mteb/models/model_implementations/ordalietech_solon_embeddings_mini_beta_1_1.py +1 -1
  84. mteb/models/model_implementations/pawan_models.py +1 -1
  85. mteb/models/model_implementations/piccolo_models.py +1 -1
  86. mteb/models/model_implementations/promptriever_models.py +4 -4
  87. mteb/models/model_implementations/pylate_models.py +5 -5
  88. mteb/models/model_implementations/qodo_models.py +2 -2
  89. mteb/models/model_implementations/qtack_models.py +1 -1
  90. mteb/models/model_implementations/qwen3_models.py +3 -3
  91. mteb/models/model_implementations/qzhou_models.py +2 -2
  92. mteb/models/model_implementations/rasgaard_models.py +1 -1
  93. mteb/models/model_implementations/reasonir_model.py +1 -1
  94. mteb/models/model_implementations/repllama_models.py +1 -1
  95. mteb/models/model_implementations/rerankers_custom.py +9 -3
  96. mteb/models/model_implementations/rerankers_monot5_based.py +14 -14
  97. mteb/models/model_implementations/richinfoai_models.py +1 -1
  98. mteb/models/model_implementations/ru_sentence_models.py +20 -20
  99. mteb/models/model_implementations/ruri_models.py +10 -10
  100. mteb/models/model_implementations/salesforce_models.py +3 -3
  101. mteb/models/model_implementations/samilpwc_models.py +1 -1
  102. mteb/models/model_implementations/sarashina_embedding_models.py +2 -2
  103. mteb/models/model_implementations/searchmap_models.py +1 -1
  104. mteb/models/model_implementations/sentence_transformers_models.py +58 -22
  105. mteb/models/model_implementations/shuu_model.py +1 -1
  106. mteb/models/model_implementations/siglip_models.py +10 -10
  107. mteb/models/model_implementations/slm_models.py +416 -0
  108. mteb/models/model_implementations/spartan8806_atles_champion.py +1 -1
  109. mteb/models/model_implementations/stella_models.py +17 -4
  110. mteb/models/model_implementations/tarka_models.py +2 -2
  111. mteb/models/model_implementations/text2vec_models.py +9 -3
  112. mteb/models/model_implementations/ua_sentence_models.py +1 -1
  113. mteb/models/model_implementations/uae_models.py +7 -1
  114. mteb/models/model_implementations/vdr_models.py +1 -1
  115. mteb/models/model_implementations/vi_vn_models.py +6 -6
  116. mteb/models/model_implementations/vlm2vec_models.py +2 -2
  117. mteb/models/model_implementations/youtu_models.py +1 -1
  118. mteb/models/model_implementations/yuan_models.py +1 -1
  119. mteb/models/model_implementations/yuan_models_en.py +1 -1
  120. mteb/models/model_meta.py +46 -17
  121. mteb/results/benchmark_results.py +2 -2
  122. mteb/tasks/classification/kur/kurdish_sentiment_classification.py +2 -2
  123. mteb/tasks/clustering/eng/hume_wiki_cities_clustering.py +1 -1
  124. mteb/tasks/clustering/eng/wiki_cities_clustering.py +1 -1
  125. mteb/tasks/clustering/zho/cmteb_clustering.py +2 -2
  126. mteb/tasks/reranking/multilingual/wikipedia_reranking_multilingual.py +1 -1
  127. mteb/tasks/retrieval/eng/cub200_i2i_retrieval.py +1 -1
  128. mteb/tasks/retrieval/vie/__init__.py +14 -6
  129. mteb/tasks/retrieval/vie/climate_fevervn_retrieval.py +39 -0
  130. mteb/tasks/retrieval/vie/db_pedia_vn_retrieval.py +39 -0
  131. mteb/tasks/retrieval/vie/fevervn_retrieval.py +39 -0
  132. mteb/tasks/retrieval/vie/hotpot_qavn_retrieval.py +39 -0
  133. mteb/tasks/retrieval/vie/msmarcovn_retrieval.py +48 -0
  134. mteb/tasks/retrieval/vie/nqvn_retrieval.py +39 -0
  135. mteb/tasks/retrieval/vie/tvpl_retrieval.py +42 -0
  136. mteb/tasks/retrieval/vie/zac_legal_text_retrieval.py +15 -1
  137. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/METADATA +3 -3
  138. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/RECORD +142 -133
  139. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/WHEEL +0 -0
  140. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/entry_points.txt +0 -0
  141. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/licenses/LICENSE +0 -0
  142. {mteb-2.6.4.dist-info → mteb-2.6.6.dist-info}/top_level.txt +0 -0
@@ -98,9 +98,8 @@ class AbsTaskClassification(AbsTask):
98
98
  text: str (for text) or PIL.Image (for image). Column name can be changed via `input_column_name` attribute.
99
99
  label: int. Column name can be changed via `label_column_name` attribute.
100
100
  evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LogisticRegression`.
101
- Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
102
- samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
103
- n_experiments: Number of experiments to run. Default is 10.
101
+ samples_per_label: Number of samples per label to use for training the evaluator model. Default is 8.
102
+ n_experiments: Number of experiments to run. Default is 10.
104
103
  train_split: Name of the split to use for training the evaluator model. Default is "train".
105
104
  label_column_name: Name of the column containing the labels. Default is "label".
106
105
  input_column_name: Name of the column containing the input data. Default is "text".
@@ -70,10 +70,10 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
70
70
  input_column_name: Name of the column containing the input text.
71
71
  label_column_name: Name of the column containing the labels.
72
72
  samples_per_label: Number of samples to use pr. label. These samples are embedded and a classifier is fit using the labels and samples.
73
- evaluator: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
73
+ evaluator_model: Classifier to use for evaluation. Must implement the SklearnModelProtocol.
74
74
  """
75
75
 
76
- evaluator: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5) # type: ignore[assignment]
76
+ evaluator_model: SklearnModelProtocol = KNeighborsClassifier(n_neighbors=5)
77
77
  input_column_name: str = "text"
78
78
  label_column_name: str = "label"
79
79
 
@@ -169,7 +169,7 @@ class AbsTaskMultilabelClassification(AbsTaskClassification):
169
169
  y_train = train_split.select(sample_indices)[self.label_column_name]
170
170
  y_train = binarizer.transform(y_train)
171
171
  y_pred, current_classifier = _evaluate_classifier(
172
- X_train, y_train, X_test, self.evaluator
172
+ X_train, y_train, X_test, self.evaluator_model
173
173
  )
174
174
  if prediction_folder:
175
175
  all_predictions.append(y_pred.tolist())
@@ -84,7 +84,7 @@ class AbsTaskRegression(AbsTaskClassification):
84
84
  n_samples: Number of samples to use for training the regression model. If the dataset has fewer samples than n_samples, all samples are used.
85
85
  abstask_prompt: Prompt to use for the task for instruction model if not prompt is provided in TaskMetadata.prompt.
86
86
  evaluator_model: The model to use for evaluation. Can be any sklearn compatible model. Default is `LinearRegression`.
87
- Full details of api in [`SklearnModelProtocol`][mteb._evaluators.sklearn_evaluator.SklearnModelProtocol].
87
+
88
88
  """
89
89
 
90
90
  evaluator: type[SklearnEvaluator] = SklearnEvaluator
@@ -285,7 +285,7 @@ class AbsTaskRetrieval(AbsTask):
285
285
  *,
286
286
  encode_kwargs: dict[str, Any],
287
287
  prediction_folder: Path | None = None,
288
- **kwargs,
288
+ **kwargs: Any,
289
289
  ) -> Mapping[HFSubset, ScoresDict]:
290
290
  """Evaluate the model on the retrieval task.
291
291
 
@@ -485,7 +485,6 @@ class TaskMetadata(BaseModel):
485
485
  dataset_type = [
486
486
  *self._hf_task_type(),
487
487
  *self._hf_task_category(),
488
- *self._hf_subtypes(),
489
488
  ]
490
489
  languages = self._hf_languages()
491
490
 
@@ -587,10 +586,8 @@ class TaskMetadata(BaseModel):
587
586
 
588
587
  def _hf_subtypes(self) -> list[str]:
589
588
  # to get full list of available task_ids execute
590
- # requests.post("https://huggingface.co/api/validate-yaml", json={
591
- # "content": "---\ntask_ids: 'test'\n---",
592
- # "repoType": "dataset"
593
- # })
589
+ # https://huggingface.co/api/datasets-tags-by-type?type=task_ids
590
+ # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
594
591
  mteb_to_hf_subtype = {
595
592
  "Article retrieval": ["document-retrieval"],
596
593
  "Conversational retrieval": ["conversational", "utterance-retrieval"],
@@ -612,7 +609,7 @@ class TaskMetadata(BaseModel):
612
609
  "hate-speech-detection",
613
610
  ],
614
611
  "Thematic clustering": [],
615
- "Scientific Reranking": [],
612
+ "Scientific Reranking": ["text-scoring"],
616
613
  "Claim verification": ["fact-checking", "fact-checking-retrieval"],
617
614
  "Topic classification": ["topic-classification"],
618
615
  "Code retrieval": [],
@@ -620,21 +617,21 @@ class TaskMetadata(BaseModel):
620
617
  "Cross-Lingual Semantic Discrimination": [],
621
618
  "Textual Entailment": ["natural-language-inference"],
622
619
  "Counterfactual Detection": [],
623
- "Emotion classification": [],
620
+ "Emotion classification": ["sentiment-classification"],
624
621
  "Reasoning as Retrieval": [],
625
622
  "Rendered Texts Understanding": [],
626
623
  "Image Text Retrieval": [],
627
624
  "Object recognition": [],
628
625
  "Scene recognition": [],
629
626
  "Caption Pairing": ["image-captioning"],
630
- "Emotion recognition": [],
627
+ "Emotion recognition": ["sentiment-scoring"],
631
628
  "Textures recognition": [],
632
629
  "Activity recognition": [],
633
630
  "Tumor detection": [],
634
631
  "Duplicate Detection": [],
635
632
  "Rendered semantic textual similarity": [
636
633
  "semantic-similarity-scoring",
637
- "rendered semantic textual similarity",
634
+ "semantic-similarity-classification",
638
635
  ],
639
636
  "Intent classification": [
640
637
  "intent-classification",
@@ -648,10 +645,8 @@ class TaskMetadata(BaseModel):
648
645
 
649
646
  def _hf_task_type(self) -> list[str]:
650
647
  # to get full list of task_types execute:
651
- # requests.post("https://huggingface.co/api/validate-yaml", json={
652
- # "content": "---\ntask_categories: ['test']\n---", "repoType": "dataset"
653
- # }).json()
654
- # or look at https://huggingface.co/tasks
648
+ # https://huggingface.co/api/datasets-tags-by-type?type=task_categories
649
+ # ref https://huggingface-openapi.hf.space/#tag/datasets/GET/api/datasets-tags-by-type
655
650
  mteb_task_type_to_datasets = {
656
651
  # Text
657
652
  "BitextMining": ["translation"],
@@ -670,7 +665,7 @@ class TaskMetadata(BaseModel):
670
665
  "Any2AnyRetrieval": ["visual-document-retrieval"],
671
666
  "Any2AnyMultilingualRetrieval": ["visual-document-retrieval"],
672
667
  "VisionCentricQA": ["visual-question-answering"],
673
- "ImageClustering": ["image-clustering"],
668
+ "ImageClustering": ["image-feature-extraction"],
674
669
  "ImageClassification": ["image-classification"],
675
670
  "ImageMultilabelClassification": ["image-classification"],
676
671
  "DocumentUnderstanding": ["visual-document-retrieval"],
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 102198,
4
+ "number_of_characters": 47870352,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 47719757,
7
+ "min_text_length": 9,
8
+ "average_text_length": 472.01951591046225,
9
+ "max_text_length": 8686,
10
+ "unique_texts": 101097
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 150595,
15
+ "min_text_length": 30,
16
+ "average_text_length": 136.78019981834694,
17
+ "max_text_length": 404,
18
+ "unique_texts": 1099
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 3401,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 3.089009990917348,
25
+ "max_relevant_docs_per_query": 5,
26
+ "unique_relevant_docs": 1123
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 132137,
4
+ "number_of_characters": 43323279,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 43311486,
7
+ "min_text_length": 11,
8
+ "average_text_length": 328.5778249819823,
9
+ "max_text_length": 8576,
10
+ "unique_texts": 131814
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 11793,
15
+ "min_text_length": 6,
16
+ "average_text_length": 36.62422360248447,
17
+ "max_text_length": 100,
18
+ "unique_texts": 321
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 11620,
23
+ "min_relevant_docs_per_query": 31,
24
+ "average_relevant_docs_per_query": 36.08695652173913,
25
+ "max_relevant_docs_per_query": 1288,
26
+ "unique_relevant_docs": 32537
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 106558,
4
+ "number_of_characters": 48164581,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 47886101,
7
+ "min_text_length": 9,
8
+ "average_text_length": 472.6783768310499,
9
+ "max_text_length": 8689,
10
+ "unique_texts": 101308
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 278480,
15
+ "min_text_length": 11,
16
+ "average_text_length": 53.04380952380952,
17
+ "max_text_length": 196,
18
+ "unique_texts": 5124
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 6254,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.1912380952380952,
25
+ "max_relevant_docs_per_query": 15,
26
+ "unique_relevant_docs": 1324
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 117974,
4
+ "number_of_characters": 35927363,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 35335613,
7
+ "min_text_length": 22,
8
+ "average_text_length": 316.47705838625023,
9
+ "max_text_length": 4105,
10
+ "unique_texts": 111651
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 591750,
15
+ "min_text_length": 21,
16
+ "average_text_length": 93.61651637399146,
17
+ "max_text_length": 280,
18
+ "unique_texts": 6321
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 12642,
23
+ "min_relevant_docs_per_query": 2,
24
+ "average_relevant_docs_per_query": 2.0,
25
+ "max_relevant_docs_per_query": 2,
26
+ "unique_relevant_docs": 11874
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "dev": {
3
+ "num_samples": 107153,
4
+ "number_of_characters": 33316879,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 33200903,
7
+ "min_text_length": 2,
8
+ "average_text_length": 320.30199218561575,
9
+ "max_text_length": 1712,
10
+ "unique_texts": 103641
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 115976,
15
+ "min_text_length": 8,
16
+ "average_text_length": 33.15494568324757,
17
+ "max_text_length": 190,
18
+ "unique_texts": 3498
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 3700,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.0577472841623785,
25
+ "max_relevant_docs_per_query": 4,
26
+ "unique_relevant_docs": 3698
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 104095,
4
+ "number_of_characters": 52312680,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 52220289,
7
+ "min_text_length": 10,
8
+ "average_text_length": 510.98673124908265,
9
+ "max_text_length": 10245,
10
+ "unique_texts": 102181
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 92391,
15
+ "min_text_length": 22,
16
+ "average_text_length": 48.62684210526316,
17
+ "max_text_length": 113,
18
+ "unique_texts": 1900
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 2283,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.201578947368421,
25
+ "max_relevant_docs_per_query": 4,
26
+ "unique_relevant_docs": 2283
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "test": {
3
+ "num_samples": 20561,
4
+ "number_of_characters": 10832770,
5
+ "documents_text_statistics": {
6
+ "total_text_length": 9929303,
7
+ "min_text_length": 9,
8
+ "average_text_length": 938.8524016641452,
9
+ "max_text_length": 6319,
10
+ "unique_texts": 10573
11
+ },
12
+ "documents_image_statistics": null,
13
+ "queries_text_statistics": {
14
+ "total_text_length": 903467,
15
+ "min_text_length": 13,
16
+ "average_text_length": 90.48242363545317,
17
+ "max_text_length": 228,
18
+ "unique_texts": 9985
19
+ },
20
+ "queries_image_statistics": null,
21
+ "relevant_docs_statistics": {
22
+ "num_relevant_docs": 11158,
23
+ "min_relevant_docs_per_query": 1,
24
+ "average_relevant_docs_per_query": 1.1174762143214823,
25
+ "max_relevant_docs_per_query": 8,
26
+ "unique_relevant_docs": 10576
27
+ },
28
+ "top_ranked_statistics": null
29
+ }
30
+ }
@@ -118,7 +118,7 @@ align_base = ModelMeta(
118
118
  open_weights=True,
119
119
  public_training_code="https://github.com/kakaobrain/coyo-align",
120
120
  public_training_data=True,
121
- framework=["PyTorch"],
121
+ framework=["PyTorch", "Transformers"],
122
122
  reference="https://huggingface.co/kakaobrain/align-base",
123
123
  similarity_fn_name=ScoringFunction.COSINE,
124
124
  use_instructions=False,
@@ -17,7 +17,7 @@ model2vecdk = ModelMeta(
17
17
  embed_dim=256,
18
18
  license="mit",
19
19
  similarity_fn_name=ScoringFunction.COSINE,
20
- framework=["NumPy", "Sentence Transformers"],
20
+ framework=["NumPy", "Sentence Transformers", "safetensors"],
21
21
  reference="https://huggingface.co/andersborges/model2vecdk",
22
22
  use_instructions=False,
23
23
  adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
@@ -48,7 +48,7 @@ model2vecdk_stem = ModelMeta(
48
48
  embed_dim=256,
49
49
  license="mit",
50
50
  similarity_fn_name=ScoringFunction.COSINE,
51
- framework=["NumPy", "Sentence Transformers"],
51
+ framework=["NumPy", "Sentence Transformers", "safetensors"],
52
52
  reference="https://huggingface.co/andersborges/model2vecdk",
53
53
  use_instructions=False,
54
54
  adapted_from="https://huggingface.co/jealk/TTC-L2V-supervised-2",
@@ -16,7 +16,7 @@ arabic_triplet_matryoshka = ModelMeta(
16
16
  max_tokens=768,
17
17
  reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2",
18
18
  similarity_fn_name=ScoringFunction.COSINE,
19
- framework=["Sentence Transformers", "PyTorch"],
19
+ framework=["Sentence Transformers", "PyTorch", "safetensors", "Transformers"],
20
20
  use_instructions=False,
21
21
  public_training_code=None,
22
22
  adapted_from="aubmindlab/bert-base-arabertv02",
@@ -145,7 +145,7 @@ arctic_embed_xs = ModelMeta(
145
145
  release_date="2024-07-08", # initial commit of hf model.
146
146
  languages=["eng-Latn"],
147
147
  open_weights=True,
148
- framework=["Sentence Transformers", "PyTorch"],
148
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
149
149
  n_parameters=22_600_000,
150
150
  memory_usage_mb=86,
151
151
  max_tokens=512,
@@ -171,7 +171,7 @@ arctic_embed_s = ModelMeta(
171
171
  release_date="2024-04-12", # initial commit of hf model.
172
172
  languages=["eng-Latn"],
173
173
  open_weights=True,
174
- framework=["Sentence Transformers", "PyTorch"],
174
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
175
175
  n_parameters=32_200_000,
176
176
  memory_usage_mb=127,
177
177
  max_tokens=512,
@@ -197,7 +197,7 @@ arctic_embed_m = ModelMeta(
197
197
  release_date="2024-04-12", # initial commit of hf model.
198
198
  languages=["eng-Latn"],
199
199
  open_weights=True,
200
- framework=["Sentence Transformers", "PyTorch"],
200
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
201
201
  n_parameters=109_000_000,
202
202
  memory_usage_mb=415,
203
203
  max_tokens=512,
@@ -223,7 +223,7 @@ arctic_embed_m_long = ModelMeta(
223
223
  release_date="2024-04-12", # initial commit of hf model.
224
224
  languages=["eng-Latn"],
225
225
  open_weights=True,
226
- framework=["Sentence Transformers", "PyTorch"],
226
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
227
227
  n_parameters=137_000_000,
228
228
  memory_usage_mb=522,
229
229
  max_tokens=2048,
@@ -248,7 +248,7 @@ arctic_embed_l = ModelMeta(
248
248
  release_date="2024-04-12", # initial commit of hf model.
249
249
  languages=["eng-Latn"],
250
250
  open_weights=True,
251
- framework=["Sentence Transformers", "PyTorch"],
251
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
252
252
  n_parameters=335_000_000,
253
253
  memory_usage_mb=1274,
254
254
  max_tokens=512,
@@ -278,7 +278,7 @@ arctic_embed_m_v1_5 = ModelMeta(
278
278
  release_date="2024-07-08", # initial commit of hf model.
279
279
  languages=["eng-Latn"],
280
280
  open_weights=True,
281
- framework=["Sentence Transformers", "PyTorch"],
281
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors", "GGUF"],
282
282
  n_parameters=109_000_000,
283
283
  memory_usage_mb=415,
284
284
  max_tokens=512,
@@ -304,7 +304,7 @@ arctic_embed_m_v2_0 = ModelMeta(
304
304
  release_date="2024-12-04", # initial commit of hf model.
305
305
  languages=LANGUAGES_V2_0,
306
306
  open_weights=True,
307
- framework=["Sentence Transformers", "PyTorch"],
307
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
308
308
  n_parameters=305_000_000,
309
309
  memory_usage_mb=1165,
310
310
  max_tokens=8192,
@@ -329,7 +329,7 @@ arctic_embed_l_v2_0 = ModelMeta(
329
329
  release_date="2024-12-04", # initial commit of hf model.
330
330
  languages=LANGUAGES_V2_0,
331
331
  open_weights=True,
332
- framework=["Sentence Transformers", "PyTorch"],
332
+ framework=["Sentence Transformers", "PyTorch", "ONNX", "safetensors"],
333
333
  n_parameters=568_000_000,
334
334
  memory_usage_mb=2166,
335
335
  max_tokens=8192,
@@ -22,7 +22,7 @@ b1ade_embed = ModelMeta(
22
22
  max_tokens=4096,
23
23
  reference="https://huggingface.co/w601sxs/b1ade-embed",
24
24
  similarity_fn_name=ScoringFunction.COSINE,
25
- framework=["Sentence Transformers", "PyTorch"],
25
+ framework=["Sentence Transformers", "PyTorch", "Transformers", "safetensors"],
26
26
  use_instructions=False,
27
27
  public_training_code=None,
28
28
  public_training_data=None,