mteb 2.2.2__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mteb/__init__.py +4 -0
- mteb/descriptive_stats/Reranking/MultiLongDocReranking.json +466 -0
- mteb/evaluate.py +38 -7
- mteb/models/__init__.py +4 -1
- mteb/models/cache_wrappers/__init__.py +2 -1
- mteb/models/model_implementations/colpali_models.py +4 -4
- mteb/models/model_implementations/colqwen_models.py +206 -2
- mteb/models/model_implementations/eagerworks_models.py +163 -0
- mteb/models/model_implementations/euler_models.py +25 -0
- mteb/models/model_implementations/google_models.py +1 -1
- mteb/models/model_implementations/jina_models.py +203 -5
- mteb/models/model_implementations/nb_sbert.py +1 -1
- mteb/models/model_implementations/nvidia_llama_nemoretriever_colemb.py +10 -11
- mteb/models/model_implementations/nvidia_models.py +1 -1
- mteb/models/model_implementations/ops_moa_models.py +2 -2
- mteb/models/model_implementations/promptriever_models.py +4 -4
- mteb/models/model_implementations/qwen3_models.py +3 -3
- mteb/models/model_implementations/qzhou_models.py +1 -1
- mteb/models/model_implementations/random_baseline.py +8 -18
- mteb/models/model_implementations/vdr_models.py +1 -0
- mteb/models/model_implementations/yuan_models_en.py +57 -0
- mteb/models/search_encoder_index/__init__.py +7 -0
- mteb/models/search_encoder_index/search_backend_protocol.py +50 -0
- mteb/models/search_encoder_index/search_indexes/__init__.py +5 -0
- mteb/models/search_encoder_index/search_indexes/faiss_search_index.py +157 -0
- mteb/models/search_wrappers.py +157 -41
- mteb/results/model_result.py +2 -1
- mteb/results/task_result.py +12 -0
- mteb/similarity_functions.py +49 -0
- mteb/tasks/reranking/multilingual/__init__.py +2 -0
- mteb/tasks/reranking/multilingual/multi_long_doc_reranking.py +70 -0
- mteb/tasks/retrieval/eng/vidore_bench_retrieval.py +4 -0
- mteb/tasks/retrieval/multilingual/jina_vdr_bench_retrieval.py +56 -42
- mteb/tasks/retrieval/multilingual/vidore2_bench_retrieval.py +3 -3
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/METADATA +6 -1
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/RECORD +40 -31
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/WHEEL +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/entry_points.txt +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/licenses/LICENSE +0 -0
- {mteb-2.2.2.dist-info → mteb-2.3.1.dist-info}/top_level.txt +0 -0
|
@@ -165,7 +165,7 @@ def load_data(self) -> None:
|
|
|
165
165
|
class JinaVDRMedicalPrescriptionsRetrieval(AbsTaskRetrieval):
|
|
166
166
|
metadata = TaskMetadata(
|
|
167
167
|
name="JinaVDRMedicalPrescriptionsRetrieval",
|
|
168
|
-
description="Retrieve medical prescriptions based on templated queries.",
|
|
168
|
+
description="Retrieve medical prescriptions based on templated queries. Source dataset https://huggingface.co/datasets/Technoculture/medical-prescriptions",
|
|
169
169
|
reference="https://huggingface.co/datasets/jinaai/medical-prescriptions_beir",
|
|
170
170
|
dataset={
|
|
171
171
|
"path": "jinaai/medical-prescriptions_beir",
|
|
@@ -186,7 +186,7 @@ class JinaVDRMedicalPrescriptionsRetrieval(AbsTaskRetrieval):
|
|
|
186
186
|
class JinaVDRStanfordSlideRetrieval(AbsTaskRetrieval):
|
|
187
187
|
metadata = TaskMetadata(
|
|
188
188
|
name="JinaVDRStanfordSlideRetrieval",
|
|
189
|
-
description="Retrieve scientific and engineering slides based on human annotated queries.",
|
|
189
|
+
description="Retrieve scientific and engineering slides based on human annotated queries. Source dataset https://exhibits.stanford.edu/data/catalog/mv327tb8364",
|
|
190
190
|
reference="https://huggingface.co/datasets/jinaai/stanford_slide_beir",
|
|
191
191
|
dataset={
|
|
192
192
|
"path": "jinaai/stanford_slide_beir",
|
|
@@ -207,7 +207,7 @@ class JinaVDRStanfordSlideRetrieval(AbsTaskRetrieval):
|
|
|
207
207
|
class JinaVDRDonutVQAISynHMPRetrieval(AbsTaskRetrieval):
|
|
208
208
|
metadata = TaskMetadata(
|
|
209
209
|
name="JinaVDRDonutVQAISynHMPRetrieval",
|
|
210
|
-
description="Retrieve medical records based on templated queries.",
|
|
210
|
+
description="Retrieve medical records based on templated queries. Source dataset https://huggingface.co/datasets/warshakhan/donut_vqa_ISynHMP",
|
|
211
211
|
reference="https://huggingface.co/datasets/jinaai/donut_vqa_beir",
|
|
212
212
|
dataset={
|
|
213
213
|
"path": "jinaai/donut_vqa_beir",
|
|
@@ -228,7 +228,7 @@ class JinaVDRDonutVQAISynHMPRetrieval(AbsTaskRetrieval):
|
|
|
228
228
|
class JinaVDRTableVQARetrieval(AbsTaskRetrieval):
|
|
229
229
|
metadata = TaskMetadata(
|
|
230
230
|
name="JinaVDRTableVQARetrieval",
|
|
231
|
-
description="Retrieve scientific tables based on LLM generated queries.",
|
|
231
|
+
description="Retrieve scientific tables based on LLM generated queries. Source datasets https://huggingface.co/datasets/HuggingFaceM4/ChartQA or https://huggingface.co/datasets/cmarkea/aftdb",
|
|
232
232
|
reference="https://huggingface.co/datasets/jinaai/table-vqa_beir",
|
|
233
233
|
dataset={
|
|
234
234
|
"path": "jinaai/table-vqa_beir",
|
|
@@ -249,7 +249,7 @@ class JinaVDRTableVQARetrieval(AbsTaskRetrieval):
|
|
|
249
249
|
class JinaVDRChartQARetrieval(AbsTaskRetrieval):
|
|
250
250
|
metadata = TaskMetadata(
|
|
251
251
|
name="JinaVDRChartQARetrieval",
|
|
252
|
-
description="Retrieve charts based on LLM generated queries.",
|
|
252
|
+
description="Retrieve charts based on LLM generated queries. Source dataset https://huggingface.co/datasets/HuggingFaceM4/ChartQA",
|
|
253
253
|
reference="https://huggingface.co/datasets/jinaai/ChartQA_beir",
|
|
254
254
|
dataset={
|
|
255
255
|
"path": "jinaai/ChartQA_beir",
|
|
@@ -270,7 +270,7 @@ class JinaVDRChartQARetrieval(AbsTaskRetrieval):
|
|
|
270
270
|
class JinaVDRTQARetrieval(AbsTaskRetrieval):
|
|
271
271
|
metadata = TaskMetadata(
|
|
272
272
|
name="JinaVDRTQARetrieval",
|
|
273
|
-
description="Retrieve textbook pages (images and text) based on LLM generated queries from the text.",
|
|
273
|
+
description="Retrieve textbook pages (images and text) based on LLM generated queries from the text. Source dataset https://prior.allenai.org/projects/tqa",
|
|
274
274
|
reference="https://huggingface.co/datasets/jinaai/tqa_beir",
|
|
275
275
|
dataset={
|
|
276
276
|
"path": "jinaai/tqa_beir",
|
|
@@ -291,7 +291,7 @@ class JinaVDRTQARetrieval(AbsTaskRetrieval):
|
|
|
291
291
|
class JinaVDROpenAINewsRetrieval(AbsTaskRetrieval):
|
|
292
292
|
metadata = TaskMetadata(
|
|
293
293
|
name="JinaVDROpenAINewsRetrieval",
|
|
294
|
-
description="Retrieve news articles from the OpenAI news website based on human annotated queries.",
|
|
294
|
+
description="Retrieve news articles from the OpenAI news website based on human annotated queries. News taken from https://openai.com/news/",
|
|
295
295
|
reference="https://huggingface.co/datasets/jinaai/openai-news_beir",
|
|
296
296
|
dataset={
|
|
297
297
|
"path": "jinaai/openai-news_beir",
|
|
@@ -312,7 +312,7 @@ class JinaVDROpenAINewsRetrieval(AbsTaskRetrieval):
|
|
|
312
312
|
class JinaVDREuropeanaDeNewsRetrieval(AbsTaskRetrieval):
|
|
313
313
|
metadata = TaskMetadata(
|
|
314
314
|
name="JinaVDREuropeanaDeNewsRetrieval",
|
|
315
|
-
description="Retrieve German news articles based on LLM generated queries.",
|
|
315
|
+
description="Retrieve German news articles based on LLM generated queries. This dataset was created from records of the [Europeana](https://europeana.eu/) online collection by selecting scans of German news articles",
|
|
316
316
|
reference="https://huggingface.co/datasets/jinaai/europeana-de-news_beir",
|
|
317
317
|
dataset={
|
|
318
318
|
"path": "jinaai/europeana-de-news_beir",
|
|
@@ -333,7 +333,7 @@ class JinaVDREuropeanaDeNewsRetrieval(AbsTaskRetrieval):
|
|
|
333
333
|
class JinaVDREuropeanaEsNewsRetrieval(AbsTaskRetrieval):
|
|
334
334
|
metadata = TaskMetadata(
|
|
335
335
|
name="JinaVDREuropeanaEsNewsRetrieval",
|
|
336
|
-
description="Retrieve Spanish news articles based on LLM generated queries.",
|
|
336
|
+
description="Retrieve Spanish news articles based on LLM generated queries. This dataset was created from records of the [Europeana](https://europeana.eu/) online collection by selecting scans of Spanish news articles",
|
|
337
337
|
reference="https://huggingface.co/datasets/jinaai/europeana-es-news_beir",
|
|
338
338
|
dataset={
|
|
339
339
|
"path": "jinaai/europeana-es-news_beir",
|
|
@@ -354,7 +354,7 @@ class JinaVDREuropeanaEsNewsRetrieval(AbsTaskRetrieval):
|
|
|
354
354
|
class JinaVDREuropeanaItScansRetrieval(AbsTaskRetrieval):
|
|
355
355
|
metadata = TaskMetadata(
|
|
356
356
|
name="JinaVDREuropeanaItScansRetrieval",
|
|
357
|
-
description="Retrieve Italian historical articles based on LLM generated queries.",
|
|
357
|
+
description="Retrieve Italian historical articles based on LLM generated queries. This dataset was created from records of the [Europeana](https://europeana.eu/) online collection by selecting scans of Italian news articles",
|
|
358
358
|
reference="https://huggingface.co/datasets/jinaai/europeana-it-scans_beir",
|
|
359
359
|
dataset={
|
|
360
360
|
"path": "jinaai/europeana-it-scans_beir",
|
|
@@ -375,7 +375,7 @@ class JinaVDREuropeanaItScansRetrieval(AbsTaskRetrieval):
|
|
|
375
375
|
class JinaVDREuropeanaNlLegalRetrieval(AbsTaskRetrieval):
|
|
376
376
|
metadata = TaskMetadata(
|
|
377
377
|
name="JinaVDREuropeanaNlLegalRetrieval",
|
|
378
|
-
description="Retrieve Dutch historical legal documents based on LLM generated queries.",
|
|
378
|
+
description="Retrieve Dutch historical legal documents based on LLM generated queries. This dataset was created from records of the [Europeana](https://europeana.eu/) online collection by selecting scans of Dutch news articles",
|
|
379
379
|
reference="https://huggingface.co/datasets/jinaai/europeana-nl-legal_beir",
|
|
380
380
|
dataset={
|
|
381
381
|
"path": "jinaai/europeana-nl-legal_beir",
|
|
@@ -417,7 +417,7 @@ class JinaVDRHindiGovVQARetrieval(AbsTaskRetrieval):
|
|
|
417
417
|
class JinaVDRAutomobileCatelogRetrieval(AbsTaskRetrieval):
|
|
418
418
|
metadata = TaskMetadata(
|
|
419
419
|
name="JinaVDRAutomobileCatelogRetrieval",
|
|
420
|
-
description="Retrieve automobile marketing documents based on LLM generated queries.",
|
|
420
|
+
description="Retrieve automobile marketing documents based on LLM generated queries. Marketing document from Toyota Japanese website featuring [RAV4](https://toyota.jp/pages/contents/request/webcatalog/rav4/rav4_special1_202310.pdf) and [Corolla](https://toyota.jp/pages/contents/request/webcatalog/corolla/corolla_special1_202407.pdf). The `text_description` column contains OCR text extracted from the images using EasyOCR.",
|
|
421
421
|
reference="https://huggingface.co/datasets/jinaai/automobile_catalogue_jp_beir",
|
|
422
422
|
dataset={
|
|
423
423
|
"path": "jinaai/automobile_catalogue_jp_beir",
|
|
@@ -438,7 +438,7 @@ class JinaVDRAutomobileCatelogRetrieval(AbsTaskRetrieval):
|
|
|
438
438
|
class JinaVDRBeveragesCatalogueRetrieval(AbsTaskRetrieval):
|
|
439
439
|
metadata = TaskMetadata(
|
|
440
440
|
name="JinaVDRBeveragesCatalogueRetrieval",
|
|
441
|
-
description="Retrieve beverages marketing documents based on LLM generated queries.",
|
|
441
|
+
description="Retrieve beverages marketing documents based on LLM generated queries. This dataset was self-curated by searching beverage catalogs on Google search and downloading PDFs.",
|
|
442
442
|
reference="https://huggingface.co/datasets/jinaai/beverages_catalogue_ru_beir",
|
|
443
443
|
dataset={
|
|
444
444
|
"path": "jinaai/beverages_catalogue_ru_beir",
|
|
@@ -459,7 +459,7 @@ class JinaVDRBeveragesCatalogueRetrieval(AbsTaskRetrieval):
|
|
|
459
459
|
class JinaVDRRamensBenchmarkRetrieval(AbsTaskRetrieval):
|
|
460
460
|
metadata = TaskMetadata(
|
|
461
461
|
name="JinaVDRRamensBenchmarkRetrieval",
|
|
462
|
-
description="Retrieve ramen restaurant marketing documents based on LLM generated queries.",
|
|
462
|
+
description="Retrieve ramen restaurant marketing documents based on LLM generated queries. Marketing document from Ramen [restaurants](https://www.city.niigata.lg.jp/kanko/kanko/oshirase/ramen.files/guidebook.pdf).",
|
|
463
463
|
reference="https://huggingface.co/datasets/jinaai/ramen_benchmark_jp_beir",
|
|
464
464
|
dataset={
|
|
465
465
|
"path": "jinaai/ramen_benchmark_jp_beir",
|
|
@@ -480,7 +480,7 @@ class JinaVDRRamensBenchmarkRetrieval(AbsTaskRetrieval):
|
|
|
480
480
|
class JinaVDRJDocQARetrieval(AbsTaskRetrieval):
|
|
481
481
|
metadata = TaskMetadata(
|
|
482
482
|
name="JinaVDRJDocQARetrieval",
|
|
483
|
-
description="Retrieve Japanese documents in various formats based on human annotated queries.",
|
|
483
|
+
description="Retrieve Japanese documents in various formats based on human annotated queries. Document Question answering from [JDocQAJP dataset](https://huggingface.co/datasets/jlli/JDocQA-nonbinary), test split.",
|
|
484
484
|
reference="https://huggingface.co/datasets/jinaai/jdocqa_beir",
|
|
485
485
|
dataset={
|
|
486
486
|
"path": "jinaai/jdocqa_beir",
|
|
@@ -501,7 +501,7 @@ class JinaVDRJDocQARetrieval(AbsTaskRetrieval):
|
|
|
501
501
|
class JinaVDRHungarianDocQARetrieval(AbsTaskRetrieval):
|
|
502
502
|
metadata = TaskMetadata(
|
|
503
503
|
name="JinaVDRHungarianDocQARetrieval",
|
|
504
|
-
description="Retrieve Hungarian documents in various formats based on human annotated queries.",
|
|
504
|
+
description="Retrieve Hungarian documents in various formats based on human annotated queries. Document Question answering from [Hungarian doc qa dataset](https://huggingface.co/datasets/jlli/HungarianDocQA-OCR), test split.",
|
|
505
505
|
reference="https://huggingface.co/datasets/jinaai/hungarian_doc_qa_beir",
|
|
506
506
|
dataset={
|
|
507
507
|
"path": "jinaai/hungarian_doc_qa_beir",
|
|
@@ -522,7 +522,7 @@ class JinaVDRHungarianDocQARetrieval(AbsTaskRetrieval):
|
|
|
522
522
|
class JinaVDRArabicChartQARetrieval(AbsTaskRetrieval):
|
|
523
523
|
metadata = TaskMetadata(
|
|
524
524
|
name="JinaVDRArabicChartQARetrieval",
|
|
525
|
-
description="Retrieve Arabic charts based on queries.",
|
|
525
|
+
description="Retrieve Arabic charts based on queries. This dataset is derived from the [Arabic ChartQA dataset](https://huggingface.co/datasets/ahmedheakl/arabic_chartqa), reformatting the train split as a test split with modified field names such that it is compatible with the ViDoRe evaluation benchmark.",
|
|
526
526
|
reference="https://huggingface.co/datasets/jinaai/arabic_chartqa_ar_beir",
|
|
527
527
|
dataset={
|
|
528
528
|
"path": "jinaai/arabic_chartqa_ar_beir",
|
|
@@ -543,7 +543,7 @@ class JinaVDRArabicChartQARetrieval(AbsTaskRetrieval):
|
|
|
543
543
|
class JinaVDRArabicInfographicsVQARetrieval(AbsTaskRetrieval):
|
|
544
544
|
metadata = TaskMetadata(
|
|
545
545
|
name="JinaVDRArabicInfographicsVQARetrieval",
|
|
546
|
-
description="Retrieve Arabic infographics based on queries.",
|
|
546
|
+
description="Retrieve Arabic infographics based on queries. This dataset is derived from the [Arabic Infographics VQA dataset](https://huggingface.co/datasets/ahmedheakl/arabic_infographicsvqa), reformatting the train split as a test split with modified field names so it can be used in the ViDoRe evaluation benchmark.",
|
|
547
547
|
reference="https://huggingface.co/datasets/jinaai/arabic_infographicsvqa_ar_beir",
|
|
548
548
|
dataset={
|
|
549
549
|
"path": "jinaai/arabic_infographicsvqa_ar_beir",
|
|
@@ -564,7 +564,7 @@ class JinaVDRArabicInfographicsVQARetrieval(AbsTaskRetrieval):
|
|
|
564
564
|
class JinaVDROWIDChartsRetrieval(AbsTaskRetrieval):
|
|
565
565
|
metadata = TaskMetadata(
|
|
566
566
|
name="JinaVDROWIDChartsRetrieval",
|
|
567
|
-
description="Retrieve charts from the OWID dataset based on accompanied text snippets.",
|
|
567
|
+
description="Retrieve charts from the OWID dataset based on accompanied text snippets. We sampled a set of ~5k charts and articles from [Our World In Data](https://ourworldindata.org) to produce this evaluation set. This particular dataset is a subsample of 1000 random charts from the full dataset which can be found [here](https://huggingface.co/datasets/jinaai/owid_charts).",
|
|
568
568
|
reference="https://huggingface.co/datasets/jinaai/owid_charts_en_beir",
|
|
569
569
|
dataset={
|
|
570
570
|
"path": "jinaai/owid_charts_en_beir",
|
|
@@ -585,7 +585,7 @@ class JinaVDROWIDChartsRetrieval(AbsTaskRetrieval):
|
|
|
585
585
|
class JinaVDRMPMQARetrieval(AbsTaskRetrieval):
|
|
586
586
|
metadata = TaskMetadata(
|
|
587
587
|
name="JinaVDRMPMQARetrieval",
|
|
588
|
-
description="Retrieve product manuals based on human annotated queries.",
|
|
588
|
+
description="Retrieve product manuals based on human annotated queries. 155 questions and 782 document images cleaned from [jinaai/MPMQA](https://huggingface.co/datasets/jinaai/MPMQA), test set.", # MPMQA does not exist on HF
|
|
589
589
|
reference="https://huggingface.co/datasets/jinaai/mpmqa_small_beir",
|
|
590
590
|
dataset={
|
|
591
591
|
"path": "jinaai/mpmqa_small_beir",
|
|
@@ -606,7 +606,7 @@ class JinaVDRMPMQARetrieval(AbsTaskRetrieval):
|
|
|
606
606
|
class JinaVDRJina2024YearlyBookRetrieval(AbsTaskRetrieval):
|
|
607
607
|
metadata = TaskMetadata(
|
|
608
608
|
name="JinaVDRJina2024YearlyBookRetrieval",
|
|
609
|
-
description="Retrieve pages from the 2024 Jina yearbook based on human annotated questions.",
|
|
609
|
+
description="Retrieve pages from the 2024 Jina yearbook based on human annotated questions. 75 human annotated questions created from the digital version of the Jina AI yearly book 2024, 166 pages in total.",
|
|
610
610
|
reference="https://huggingface.co/datasets/jinaai/jina_2024_yearly_book_beir",
|
|
611
611
|
dataset={
|
|
612
612
|
"path": "jinaai/jina_2024_yearly_book_beir",
|
|
@@ -627,7 +627,7 @@ class JinaVDRJina2024YearlyBookRetrieval(AbsTaskRetrieval):
|
|
|
627
627
|
class JinaVDRWikimediaCommonsMapsRetrieval(AbsTaskRetrieval):
|
|
628
628
|
metadata = TaskMetadata(
|
|
629
629
|
name="JinaVDRWikimediaCommonsMapsRetrieval",
|
|
630
|
-
description="Retrieve maps from Wikimedia Commons based on their description.",
|
|
630
|
+
description="Retrieve maps from Wikimedia Commons based on their description. It contains images of (mostly historic) maps which should be identified based on their description. We extracted those descriptions from [Wikimedia Commons](https://commons.wikimedia.org/). We have included the license type and a link (license_text) to the original Wikimedia Commons page for each extracted image.",
|
|
631
631
|
reference="https://huggingface.co/datasets/jinaai/wikimedia-commons-maps_beir",
|
|
632
632
|
dataset={
|
|
633
633
|
"path": "jinaai/wikimedia-commons-maps_beir",
|
|
@@ -648,7 +648,7 @@ class JinaVDRWikimediaCommonsMapsRetrieval(AbsTaskRetrieval):
|
|
|
648
648
|
class JinaVDRPlotQARetrieval(AbsTaskRetrieval):
|
|
649
649
|
metadata = TaskMetadata(
|
|
650
650
|
name="JinaVDRPlotQARetrieval",
|
|
651
|
-
description="Retrieve plots from the PlotQA dataset based on LLM generated queries.",
|
|
651
|
+
description="Retrieve plots from the PlotQA dataset based on LLM generated queries. Questions subsampled from [PlotQA](https://github.com/NiteshMethani/PlotQA) test set. It is following a subsample + LLM-based classification process, using LLM to verify the question quality, e.g. queries like `How many different coloured dotlines are there` will be filtered out.",
|
|
652
652
|
reference="https://huggingface.co/datasets/jinaai/plotqa_beir",
|
|
653
653
|
dataset={
|
|
654
654
|
"path": "jinaai/plotqa_beir",
|
|
@@ -669,7 +669,7 @@ class JinaVDRPlotQARetrieval(AbsTaskRetrieval):
|
|
|
669
669
|
class JinaVDRMMTabRetrieval(AbsTaskRetrieval):
|
|
670
670
|
metadata = TaskMetadata(
|
|
671
671
|
name="JinaVDRMMTabRetrieval",
|
|
672
|
-
description="Retrieve tables from the MMTab dataset based on queries.",
|
|
672
|
+
description="Retrieve tables from the MMTab dataset based on queries. This dataset is a copy of the original test split from MMTab, taking only items where an 'original_query' is present, and removing the 'input' and 'output' columns, as they are unnecessary for retrieval tasks.",
|
|
673
673
|
reference="https://huggingface.co/datasets/jinaai/MMTab_beir",
|
|
674
674
|
dataset={
|
|
675
675
|
"path": "jinaai/MMTab_beir",
|
|
@@ -690,7 +690,7 @@ class JinaVDRMMTabRetrieval(AbsTaskRetrieval):
|
|
|
690
690
|
class JinaVDRCharXivOCRRetrieval(AbsTaskRetrieval):
|
|
691
691
|
metadata = TaskMetadata(
|
|
692
692
|
name="JinaVDRCharXivOCRRetrieval",
|
|
693
|
-
description="Retrieve charts from scientific papers based on human annotated queries.",
|
|
693
|
+
description="Retrieve charts from scientific papers based on human annotated queries. This dataset is derived from the [CharXiv dataset](https://huggingface.co/datasets/princeton-nlp/CharXiv), reformatting the test split with modified field names, so that it can be used in the ViDoRe benchmark.",
|
|
694
694
|
reference="https://huggingface.co/datasets/jinaai/CharXiv-en_beir",
|
|
695
695
|
dataset={
|
|
696
696
|
"path": "jinaai/CharXiv-en_beir",
|
|
@@ -711,7 +711,7 @@ class JinaVDRCharXivOCRRetrieval(AbsTaskRetrieval):
|
|
|
711
711
|
class JinaVDRStudentEnrollmentSyntheticRetrieval(AbsTaskRetrieval):
|
|
712
712
|
metadata = TaskMetadata(
|
|
713
713
|
name="JinaVDRStudentEnrollmentSyntheticRetrieval",
|
|
714
|
-
description="Retrieve student enrollment data based on templated queries.",
|
|
714
|
+
description="Retrieve student enrollment data based on templated queries. This dataset is created from the original Kaggle [Delaware Student Enrollment](https://www.kaggle.com/datasets/noeyislearning/delaware-student-enrollment) dataset. The charts are rendered and queries created using templates.",
|
|
715
715
|
reference="https://huggingface.co/datasets/jinaai/student-enrollment_beir",
|
|
716
716
|
dataset={
|
|
717
717
|
"path": "jinaai/student-enrollment_beir",
|
|
@@ -732,7 +732,11 @@ class JinaVDRStudentEnrollmentSyntheticRetrieval(AbsTaskRetrieval):
|
|
|
732
732
|
class JinaVDRGitHubReadmeRetrieval(AbsTaskRetrieval):
|
|
733
733
|
metadata = TaskMetadata(
|
|
734
734
|
name="JinaVDRGitHubReadmeRetrieval",
|
|
735
|
-
description=
|
|
735
|
+
description=(
|
|
736
|
+
"Retrieve GitHub readme files based on their description. "
|
|
737
|
+
"This dataset consists of rendered GitHub readmes in a variety of different languages, together with their accompanying descriptions as queries and their license in the `license_type` and `license_text` columns. "
|
|
738
|
+
"This particular dataset is a subsample of 1000 random rows per language from the full dataset which can be found [here](https://huggingface.co/datasets/jinaai/github-readme-retrieval-ml-filtered)."
|
|
739
|
+
),
|
|
736
740
|
reference="https://huggingface.co/datasets/jinaai/github-readme-retrieval-multilingual_beir",
|
|
737
741
|
dataset={
|
|
738
742
|
"path": "jinaai/github-readme-retrieval-multilingual_beir",
|
|
@@ -773,7 +777,7 @@ class JinaVDRGitHubReadmeRetrieval(AbsTaskRetrieval):
|
|
|
773
777
|
class JinaVDRTweetStockSyntheticsRetrieval(AbsTaskRetrieval):
|
|
774
778
|
metadata = TaskMetadata(
|
|
775
779
|
name="JinaVDRTweetStockSyntheticsRetrieval",
|
|
776
|
-
description="Retrieve rendered tables of stock prices based on templated queries.",
|
|
780
|
+
description="Retrieve rendered tables of stock prices based on templated queries. This dataset is created from the original Kaggle [Tweet Sentiment's Impact on Stock Returns](https://www.kaggle.com/datasets/thedevastator/tweet-sentiment-s-impact-on-stock-returns) dataset.",
|
|
777
781
|
reference="https://huggingface.co/datasets/jinaai/tweet-stock-synthetic-retrieval_beir",
|
|
778
782
|
dataset={
|
|
779
783
|
"path": "jinaai/tweet-stock-synthetic-retrieval_beir",
|
|
@@ -796,7 +800,7 @@ class JinaVDRTweetStockSyntheticsRetrieval(AbsTaskRetrieval):
|
|
|
796
800
|
class JinaVDRAirbnbSyntheticRetrieval(AbsTaskRetrieval):
|
|
797
801
|
metadata = TaskMetadata(
|
|
798
802
|
name="JinaVDRAirbnbSyntheticRetrieval",
|
|
799
|
-
description="Retrieve rendered tables from Airbnb listings based on templated queries.",
|
|
803
|
+
description="Retrieve rendered tables from Airbnb listings based on templated queries. This dataset is created from the original Kaggle [New York City Airbnb Open Data dataset](https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data).",
|
|
800
804
|
reference="https://huggingface.co/datasets/jinaai/airbnb-synthetic-retrieval_beir",
|
|
801
805
|
dataset={
|
|
802
806
|
"path": "jinaai/airbnb-synthetic-retrieval_beir",
|
|
@@ -819,7 +823,7 @@ class JinaVDRAirbnbSyntheticRetrieval(AbsTaskRetrieval):
|
|
|
819
823
|
class JinaVDRShanghaiMasterPlanRetrieval(AbsTaskRetrieval):
|
|
820
824
|
metadata = TaskMetadata(
|
|
821
825
|
name="JinaVDRShanghaiMasterPlanRetrieval",
|
|
822
|
-
description="Retrieve pages from the Shanghai Master Plan based on human annotated queries.",
|
|
826
|
+
description="Retrieve pages from the Shanghai Master Plan based on human annotated queries. The master plan document is taken from [here](https://www.shanghai.gov.cn/newshanghai/xxgkfj/2035004.pdf).",
|
|
823
827
|
reference="https://huggingface.co/datasets/jinaai/shanghai_master_plan_beir",
|
|
824
828
|
dataset={
|
|
825
829
|
"path": "jinaai/shanghai_master_plan_beir",
|
|
@@ -840,7 +844,7 @@ class JinaVDRShanghaiMasterPlanRetrieval(AbsTaskRetrieval):
|
|
|
840
844
|
class JinaVDRWikimediaCommonsDocumentsRetrieval(AbsTaskRetrieval):
|
|
841
845
|
metadata = TaskMetadata(
|
|
842
846
|
name="JinaVDRWikimediaCommonsDocumentsRetrieval",
|
|
843
|
-
description="Retrieve historical documents from Wikimedia Commons based on their description.",
|
|
847
|
+
description="Retrieve historical documents from Wikimedia Commons based on their description. Wikimedia Commons Documents. It contains images of (mostly historic) documents which should be identified based on their description. We extracted those descriptions from Wikimedia Commons. We have included the license type and a link (`license_text`) to the original Wikimedia Commons page for each extracted image.",
|
|
844
848
|
reference="https://huggingface.co/datasets/jinaai/wikimedia-commons-documents-ml_beir",
|
|
845
849
|
dataset={
|
|
846
850
|
"path": "jinaai/wikimedia-commons-documents-ml_beir",
|
|
@@ -884,7 +888,7 @@ class JinaVDRWikimediaCommonsDocumentsRetrieval(AbsTaskRetrieval):
|
|
|
884
888
|
class JinaVDREuropeanaFrNewsRetrieval(AbsTaskRetrieval):
|
|
885
889
|
metadata = TaskMetadata(
|
|
886
890
|
name="JinaVDREuropeanaFrNewsRetrieval",
|
|
887
|
-
description="Retrieve French news articles from Europeana based on LLM generated queries.",
|
|
891
|
+
description="Retrieve French news articles from Europeana based on LLM generated queries. This dataset was created from records of the [Europeana online collection](https://europeana.eu) by selecting scans of French news articles.",
|
|
888
892
|
reference="https://huggingface.co/datasets/jinaai/europeana-fr-news_beir",
|
|
889
893
|
dataset={
|
|
890
894
|
"path": "jinaai/europeana-fr-news_beir",
|
|
@@ -905,7 +909,7 @@ class JinaVDREuropeanaFrNewsRetrieval(AbsTaskRetrieval):
|
|
|
905
909
|
class JinaVDRDocQAHealthcareIndustryRetrieval(AbsTaskRetrieval):
|
|
906
910
|
metadata = TaskMetadata(
|
|
907
911
|
name="JinaVDRDocQAHealthcareIndustryRetrieval",
|
|
908
|
-
description="Retrieve healthcare industry documents based on LLM generated queries.",
|
|
912
|
+
description="Retrieve healthcare industry documents based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d). For more information regarding the filtering please read [our paper](https://arxiv.org/abs/2506.18902) or [this discussion on github](https://github.com/embeddings-benchmark/mteb/pull/2942#discussion_r2240711654).",
|
|
909
913
|
reference="https://huggingface.co/datasets/jinaai/docqa_healthcare_industry_beir",
|
|
910
914
|
dataset={
|
|
911
915
|
"path": "jinaai/docqa_healthcare_industry_beir",
|
|
@@ -917,6 +921,7 @@ class JinaVDRDocQAHealthcareIndustryRetrieval(AbsTaskRetrieval):
|
|
|
917
921
|
license="mit",
|
|
918
922
|
annotations_creators="derived",
|
|
919
923
|
sample_creation="found",
|
|
924
|
+
adapted_from=["VidoreDocVQARetrieval"],
|
|
920
925
|
**COMMON_METADATA,
|
|
921
926
|
)
|
|
922
927
|
|
|
@@ -926,7 +931,7 @@ class JinaVDRDocQAHealthcareIndustryRetrieval(AbsTaskRetrieval):
|
|
|
926
931
|
class JinaVDRDocQAAI(AbsTaskRetrieval):
|
|
927
932
|
metadata = TaskMetadata(
|
|
928
933
|
name="JinaVDRDocQAAI",
|
|
929
|
-
description="Retrieve AI documents based on LLM generated queries.",
|
|
934
|
+
description="Retrieve AI documents based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
930
935
|
reference="https://huggingface.co/datasets/jinaai/docqa_artificial_intelligence_beir",
|
|
931
936
|
dataset={
|
|
932
937
|
"path": "jinaai/docqa_artificial_intelligence_beir",
|
|
@@ -938,6 +943,7 @@ class JinaVDRDocQAAI(AbsTaskRetrieval):
|
|
|
938
943
|
license="mit",
|
|
939
944
|
annotations_creators="derived",
|
|
940
945
|
sample_creation="found",
|
|
946
|
+
adapted_from=["VidoreDocVQARetrieval"],
|
|
941
947
|
**COMMON_METADATA,
|
|
942
948
|
)
|
|
943
949
|
|
|
@@ -947,7 +953,7 @@ class JinaVDRDocQAAI(AbsTaskRetrieval):
|
|
|
947
953
|
class JinaVDRShiftProjectRetrieval(AbsTaskRetrieval):
|
|
948
954
|
metadata = TaskMetadata(
|
|
949
955
|
name="JinaVDRShiftProjectRetrieval",
|
|
950
|
-
description="Retrieve documents with graphs from the Shift Project based on LLM generated queries.",
|
|
956
|
+
description="Retrieve documents with graphs from the Shift Project based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
951
957
|
reference="https://huggingface.co/datasets/jinaai/shiftproject_beir",
|
|
952
958
|
dataset={
|
|
953
959
|
"path": "jinaai/shiftproject_beir",
|
|
@@ -959,6 +965,7 @@ class JinaVDRShiftProjectRetrieval(AbsTaskRetrieval):
|
|
|
959
965
|
license="mit",
|
|
960
966
|
annotations_creators="derived",
|
|
961
967
|
sample_creation="found",
|
|
968
|
+
adapted_from=["VidoreShiftProjectRetrieval"],
|
|
962
969
|
**COMMON_METADATA,
|
|
963
970
|
)
|
|
964
971
|
|
|
@@ -968,7 +975,7 @@ class JinaVDRShiftProjectRetrieval(AbsTaskRetrieval):
|
|
|
968
975
|
class JinaVDRTatQARetrieval(AbsTaskRetrieval):
|
|
969
976
|
metadata = TaskMetadata(
|
|
970
977
|
name="JinaVDRTatQARetrieval",
|
|
971
|
-
description="Retrieve financial reports based on human annotated queries.",
|
|
978
|
+
description="Retrieve financial reports based on human annotated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
972
979
|
reference="https://huggingface.co/datasets/jinaai/tatqa_beir",
|
|
973
980
|
dataset={
|
|
974
981
|
"path": "jinaai/tatqa_beir",
|
|
@@ -980,6 +987,7 @@ class JinaVDRTatQARetrieval(AbsTaskRetrieval):
|
|
|
980
987
|
license="mit",
|
|
981
988
|
annotations_creators="derived",
|
|
982
989
|
sample_creation="found",
|
|
990
|
+
adapted_from=["VidoreTatdqaRetrieval"],
|
|
983
991
|
**COMMON_METADATA,
|
|
984
992
|
)
|
|
985
993
|
|
|
@@ -989,7 +997,7 @@ class JinaVDRTatQARetrieval(AbsTaskRetrieval):
|
|
|
989
997
|
class JinaVDRInfovqaRetrieval(AbsTaskRetrieval):
|
|
990
998
|
metadata = TaskMetadata(
|
|
991
999
|
name="JinaVDRInfovqaRetrieval",
|
|
992
|
-
description="Retrieve infographics based on human annotated queries.",
|
|
1000
|
+
description="Retrieve infographics based on human annotated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
993
1001
|
reference="https://huggingface.co/datasets/jinaai/infovqa_beir",
|
|
994
1002
|
dataset={
|
|
995
1003
|
"path": "jinaai/infovqa_beir",
|
|
@@ -1001,6 +1009,7 @@ class JinaVDRInfovqaRetrieval(AbsTaskRetrieval):
|
|
|
1001
1009
|
license="mit",
|
|
1002
1010
|
annotations_creators="derived",
|
|
1003
1011
|
sample_creation="found",
|
|
1012
|
+
adapted_from=["VidoreInfoVQARetrieval"],
|
|
1004
1013
|
**COMMON_METADATA,
|
|
1005
1014
|
)
|
|
1006
1015
|
|
|
@@ -1010,7 +1019,7 @@ class JinaVDRInfovqaRetrieval(AbsTaskRetrieval):
|
|
|
1010
1019
|
class JinaVDRDocVQARetrieval(AbsTaskRetrieval):
|
|
1011
1020
|
metadata = TaskMetadata(
|
|
1012
1021
|
name="JinaVDRDocVQARetrieval",
|
|
1013
|
-
description="Retrieve industry documents based on human annotated queries.",
|
|
1022
|
+
description="Retrieve industry documents based on human annotated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
1014
1023
|
reference="https://huggingface.co/datasets/jinaai/docvqa_beir",
|
|
1015
1024
|
dataset={
|
|
1016
1025
|
"path": "jinaai/docvqa_beir",
|
|
@@ -1022,6 +1031,7 @@ class JinaVDRDocVQARetrieval(AbsTaskRetrieval):
|
|
|
1022
1031
|
license="cc-by-4.0",
|
|
1023
1032
|
annotations_creators="LM-generated",
|
|
1024
1033
|
sample_creation="found",
|
|
1034
|
+
adapted_from=["VidoreDocVQARetrieval"],
|
|
1025
1035
|
**COMMON_METADATA,
|
|
1026
1036
|
)
|
|
1027
1037
|
|
|
@@ -1031,7 +1041,7 @@ class JinaVDRDocVQARetrieval(AbsTaskRetrieval):
|
|
|
1031
1041
|
class JinaVDRDocQAGovReportRetrieval(AbsTaskRetrieval):
|
|
1032
1042
|
metadata = TaskMetadata(
|
|
1033
1043
|
name="JinaVDRDocQAGovReportRetrieval",
|
|
1034
|
-
description="Retrieve government reports based on LLM generated queries.",
|
|
1044
|
+
description="Retrieve government reports based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
1035
1045
|
reference="https://huggingface.co/datasets/jinaai/docqa_gov_report_beir",
|
|
1036
1046
|
dataset={
|
|
1037
1047
|
"path": "jinaai/docqa_gov_report_beir",
|
|
@@ -1043,6 +1053,7 @@ class JinaVDRDocQAGovReportRetrieval(AbsTaskRetrieval):
|
|
|
1043
1053
|
license="mit",
|
|
1044
1054
|
annotations_creators="derived",
|
|
1045
1055
|
sample_creation="found",
|
|
1056
|
+
adapted_from=["VidoreDocVQARetrieval"],
|
|
1046
1057
|
**COMMON_METADATA,
|
|
1047
1058
|
)
|
|
1048
1059
|
|
|
@@ -1052,7 +1063,7 @@ class JinaVDRDocQAGovReportRetrieval(AbsTaskRetrieval):
|
|
|
1052
1063
|
class JinaVDRTabFQuadRetrieval(AbsTaskRetrieval):
|
|
1053
1064
|
metadata = TaskMetadata(
|
|
1054
1065
|
name="JinaVDRTabFQuadRetrieval",
|
|
1055
|
-
description="Retrieve tables from industry documents based on LLM generated queries.",
|
|
1066
|
+
description="Retrieve tables from industry documents based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
1056
1067
|
reference="https://huggingface.co/datasets/jinaai/tabfquad_beir",
|
|
1057
1068
|
dataset={
|
|
1058
1069
|
"path": "jinaai/tabfquad_beir",
|
|
@@ -1064,6 +1075,7 @@ class JinaVDRTabFQuadRetrieval(AbsTaskRetrieval):
|
|
|
1064
1075
|
license="mit",
|
|
1065
1076
|
annotations_creators="derived",
|
|
1066
1077
|
sample_creation="found",
|
|
1078
|
+
adapted_from=["VidoreTabfquadRetrieval"],
|
|
1067
1079
|
**COMMON_METADATA,
|
|
1068
1080
|
)
|
|
1069
1081
|
|
|
@@ -1073,7 +1085,7 @@ class JinaVDRTabFQuadRetrieval(AbsTaskRetrieval):
|
|
|
1073
1085
|
class JinaVDRDocQAEnergyRetrieval(AbsTaskRetrieval):
|
|
1074
1086
|
metadata = TaskMetadata(
|
|
1075
1087
|
name="JinaVDRDocQAEnergyRetrieval",
|
|
1076
|
-
description="Retrieve energy industry documents based on LLM generated queries.",
|
|
1088
|
+
description="Retrieve energy industry documents based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
1077
1089
|
reference="https://huggingface.co/datasets/jinaai/docqa_energy_beir",
|
|
1078
1090
|
dataset={
|
|
1079
1091
|
"path": "jinaai/docqa_energy_beir",
|
|
@@ -1085,6 +1097,7 @@ class JinaVDRDocQAEnergyRetrieval(AbsTaskRetrieval):
|
|
|
1085
1097
|
license="mit",
|
|
1086
1098
|
annotations_creators="derived",
|
|
1087
1099
|
sample_creation="found",
|
|
1100
|
+
adapted_from=["VidoreDocVQARetrieval"],
|
|
1088
1101
|
**COMMON_METADATA,
|
|
1089
1102
|
)
|
|
1090
1103
|
|
|
@@ -1094,7 +1107,7 @@ class JinaVDRDocQAEnergyRetrieval(AbsTaskRetrieval):
|
|
|
1094
1107
|
class JinaVDRArxivQARetrieval(AbsTaskRetrieval):
|
|
1095
1108
|
metadata = TaskMetadata(
|
|
1096
1109
|
name="JinaVDRArxivQARetrieval",
|
|
1097
|
-
description="Retrieve figures from scientific papers from arXiv based on LLM generated queries.",
|
|
1110
|
+
description="Retrieve figures from scientific papers from arXiv based on LLM generated queries. This dataset is built upon the corresponding dataset from the [ViDoRe Benchmark](https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d).",
|
|
1098
1111
|
reference="https://huggingface.co/datasets/jinaai/arxivqa_beir",
|
|
1099
1112
|
dataset={
|
|
1100
1113
|
"path": "jinaai/arxivqa_beir",
|
|
@@ -1106,6 +1119,7 @@ class JinaVDRArxivQARetrieval(AbsTaskRetrieval):
|
|
|
1106
1119
|
license="cc-by-4.0",
|
|
1107
1120
|
annotations_creators="LM-generated",
|
|
1108
1121
|
sample_creation="found",
|
|
1122
|
+
adapted_from=["VidoreArxivQARetrieval"],
|
|
1109
1123
|
**COMMON_METADATA,
|
|
1110
1124
|
)
|
|
1111
1125
|
|
|
@@ -41,7 +41,6 @@ def _load_data(
|
|
|
41
41
|
},
|
|
42
42
|
remove_columns=["query-id", "query"],
|
|
43
43
|
)
|
|
44
|
-
query_ds = query_ds.select_columns(["id", "text"])
|
|
45
44
|
|
|
46
45
|
corpus_ds = load_dataset(
|
|
47
46
|
path,
|
|
@@ -66,7 +65,7 @@ def _load_data(
|
|
|
66
65
|
)
|
|
67
66
|
|
|
68
67
|
if langs is None:
|
|
69
|
-
queries[split] = query_ds
|
|
68
|
+
queries[split] = query_ds.select_columns(["id", "text"])
|
|
70
69
|
corpus[split] = corpus_ds
|
|
71
70
|
relevant_docs[split] = {}
|
|
72
71
|
for row in qrels_ds:
|
|
@@ -77,7 +76,8 @@ def _load_data(
|
|
|
77
76
|
relevant_docs[split][qid][did] = int(row["score"])
|
|
78
77
|
else:
|
|
79
78
|
for lang in langs:
|
|
80
|
-
|
|
79
|
+
filtered_query_ds = query_ds.filter(lambda x: x["language"] == lang)
|
|
80
|
+
queries[lang][split] = filtered_query_ds.select_columns(["id", "text"])
|
|
81
81
|
|
|
82
82
|
corpus[lang][split] = corpus_ds
|
|
83
83
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mteb
|
|
3
|
-
Version: 2.2.2
|
|
3
|
+
Version: 2.3.1
|
|
4
4
|
Summary: Massive Text Embedding Benchmark
|
|
5
5
|
Author-email: MTEB Contributors <niklas@huggingface.co>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Nouamane Tazi <nouamane@huggingface.co>, Nils Reimers <info@nils-reimers.de>
|
|
6
6
|
Maintainer-email: Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>, Roman Solomatin <risolomatin@gmail.com>, Isaac Chung <chungisaac1217@gmail.com>
|
|
@@ -91,6 +91,9 @@ Requires-Dist: volcengine-python-sdk[ark]==3.0.2; extra == "ark"
|
|
|
91
91
|
Requires-Dist: tiktoken>=0.8.0; extra == "ark"
|
|
92
92
|
Provides-Extra: colpali-engine
|
|
93
93
|
Requires-Dist: colpali_engine>=0.3.12; extra == "colpali-engine"
|
|
94
|
+
Provides-Extra: colqwen3
|
|
95
|
+
Requires-Dist: transformers>=4.57; extra == "colqwen3"
|
|
96
|
+
Requires-Dist: torchvision>=0.22.1; extra == "colqwen3"
|
|
94
97
|
Provides-Extra: xet
|
|
95
98
|
Requires-Dist: huggingface_hub>=0.32.0; extra == "xet"
|
|
96
99
|
Provides-Extra: youtu
|
|
@@ -100,6 +103,8 @@ Provides-Extra: llama-embed-nemotron
|
|
|
100
103
|
Requires-Dist: transformers==4.51.0; extra == "llama-embed-nemotron"
|
|
101
104
|
Provides-Extra: faiss-cpu
|
|
102
105
|
Requires-Dist: faiss-cpu>=1.12.0; extra == "faiss-cpu"
|
|
106
|
+
Provides-Extra: eager-embed
|
|
107
|
+
Requires-Dist: qwen_vl_utils>=0.0.14; extra == "eager-embed"
|
|
103
108
|
Dynamic: license-file
|
|
104
109
|
|
|
105
110
|
<h1 align="center">
|