evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -6,20 +6,20 @@ class T2Retrieval(AbsTaskRetrieval):
|
|
|
6
6
|
ignore_identical_ids = True
|
|
7
7
|
|
|
8
8
|
metadata = TaskMetadata(
|
|
9
|
-
name=
|
|
10
|
-
description=
|
|
11
|
-
reference=
|
|
9
|
+
name='T2Retrieval',
|
|
10
|
+
description='T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
|
|
11
|
+
reference='https://arxiv.org/abs/2304.03679',
|
|
12
12
|
dataset={
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
'path': 'C-MTEB/T2Retrieval',
|
|
14
|
+
'revision': '8731a845f1bf500a4f111cf1070785c793d10e64',
|
|
15
|
+
'qrel_revision': '1c83b8d1544e529875e3f6930f3a1fcf749a8e97',
|
|
16
16
|
},
|
|
17
|
-
type=
|
|
18
|
-
category=
|
|
19
|
-
modalities=[
|
|
20
|
-
eval_splits=[
|
|
21
|
-
eval_langs=[
|
|
22
|
-
main_score=
|
|
17
|
+
type='Retrieval',
|
|
18
|
+
category='s2p',
|
|
19
|
+
modalities=['text'],
|
|
20
|
+
eval_splits=['dev'],
|
|
21
|
+
eval_langs=['cmn-Hans'],
|
|
22
|
+
main_score='ndcg_at_10',
|
|
23
23
|
date=None,
|
|
24
24
|
domains=None,
|
|
25
25
|
task_subtypes=None,
|
|
@@ -28,7 +28,7 @@ class T2Retrieval(AbsTaskRetrieval):
|
|
|
28
28
|
dialect=None,
|
|
29
29
|
sample_creation=None,
|
|
30
30
|
bibtex_citation="""@misc{xie2023t2ranking,
|
|
31
|
-
title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
|
|
31
|
+
title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
|
|
32
32
|
author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
|
|
33
33
|
year={2023},
|
|
34
34
|
eprint={2304.03679},
|
|
@@ -36,14 +36,14 @@ class T2Retrieval(AbsTaskRetrieval):
|
|
|
36
36
|
primaryClass={cs.IR}
|
|
37
37
|
}""",
|
|
38
38
|
descriptive_stats={
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
39
|
+
'n_samples': None,
|
|
40
|
+
'avg_character_length': {
|
|
41
|
+
'dev': {
|
|
42
|
+
'average_document_length': 874.1184182791619,
|
|
43
|
+
'average_query_length': 10.938847974750132,
|
|
44
|
+
'num_documents': 118605,
|
|
45
|
+
'num_queries': 22812,
|
|
46
|
+
'average_relevant_docs_per_query': 5.213571804313519,
|
|
47
47
|
}
|
|
48
48
|
},
|
|
49
49
|
},
|
|
@@ -54,20 +54,20 @@ class MMarcoRetrieval(AbsTaskRetrieval):
|
|
|
54
54
|
ignore_identical_ids = True
|
|
55
55
|
|
|
56
56
|
metadata = TaskMetadata(
|
|
57
|
-
name=
|
|
58
|
-
description=
|
|
59
|
-
reference=
|
|
57
|
+
name='MMarcoRetrieval',
|
|
58
|
+
description='MMarcoRetrieval',
|
|
59
|
+
reference='https://arxiv.org/abs/2309.07597',
|
|
60
60
|
dataset={
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
61
|
+
'path': 'C-MTEB/MMarcoRetrieval',
|
|
62
|
+
'revision': '539bbde593d947e2a124ba72651aafc09eb33fc2',
|
|
63
|
+
'qrel_revision': 'bae08bb7bddbedb96c7e7db52018a55167b67f89',
|
|
64
64
|
},
|
|
65
|
-
type=
|
|
66
|
-
category=
|
|
67
|
-
modalities=[
|
|
68
|
-
eval_splits=[
|
|
69
|
-
eval_langs=[
|
|
70
|
-
main_score=
|
|
65
|
+
type='Retrieval',
|
|
66
|
+
category='s2p',
|
|
67
|
+
modalities=['text'],
|
|
68
|
+
eval_splits=['dev'],
|
|
69
|
+
eval_langs=['cmn-Hans'],
|
|
70
|
+
main_score='ndcg_at_10',
|
|
71
71
|
date=None,
|
|
72
72
|
domains=None,
|
|
73
73
|
task_subtypes=None,
|
|
@@ -76,7 +76,7 @@ class MMarcoRetrieval(AbsTaskRetrieval):
|
|
|
76
76
|
dialect=None,
|
|
77
77
|
sample_creation=None,
|
|
78
78
|
bibtex_citation="""@misc{xiao2024cpack,
|
|
79
|
-
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
|
|
79
|
+
title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
|
|
80
80
|
author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
|
|
81
81
|
year={2024},
|
|
82
82
|
eprint={2309.07597},
|
|
@@ -84,14 +84,14 @@ class MMarcoRetrieval(AbsTaskRetrieval):
|
|
|
84
84
|
primaryClass={cs.CL}
|
|
85
85
|
}""",
|
|
86
86
|
descriptive_stats={
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
87
|
+
'n_samples': None,
|
|
88
|
+
'avg_character_length': {
|
|
89
|
+
'dev': {
|
|
90
|
+
'average_document_length': 114.41787048392986,
|
|
91
|
+
'average_query_length': 10.51131805157593,
|
|
92
|
+
'num_documents': 106813,
|
|
93
|
+
'num_queries': 6980,
|
|
94
|
+
'average_relevant_docs_per_query': 1.0654727793696275,
|
|
95
95
|
}
|
|
96
96
|
},
|
|
97
97
|
},
|
|
@@ -100,20 +100,20 @@ class MMarcoRetrieval(AbsTaskRetrieval):
|
|
|
100
100
|
|
|
101
101
|
class DuRetrieval(AbsTaskRetrieval):
|
|
102
102
|
metadata = TaskMetadata(
|
|
103
|
-
name=
|
|
104
|
-
description=
|
|
105
|
-
reference=
|
|
103
|
+
name='DuRetrieval',
|
|
104
|
+
description='A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine',
|
|
105
|
+
reference='https://aclanthology.org/2022.emnlp-main.357.pdf',
|
|
106
106
|
dataset={
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
107
|
+
'path': 'C-MTEB/DuRetrieval',
|
|
108
|
+
'revision': 'a1a333e290fe30b10f3f56498e3a0d911a693ced',
|
|
109
|
+
'qrel_revision': '497b7bd1bbb25cb3757ff34d95a8be50a3de2279',
|
|
110
110
|
},
|
|
111
|
-
type=
|
|
112
|
-
category=
|
|
113
|
-
modalities=[
|
|
114
|
-
eval_splits=[
|
|
115
|
-
eval_langs=[
|
|
116
|
-
main_score=
|
|
111
|
+
type='Retrieval',
|
|
112
|
+
category='s2p',
|
|
113
|
+
modalities=['text'],
|
|
114
|
+
eval_splits=['dev'],
|
|
115
|
+
eval_langs=['cmn-Hans'],
|
|
116
|
+
main_score='ndcg_at_10',
|
|
117
117
|
date=None,
|
|
118
118
|
domains=None,
|
|
119
119
|
task_subtypes=None,
|
|
@@ -122,7 +122,7 @@ class DuRetrieval(AbsTaskRetrieval):
|
|
|
122
122
|
dialect=None,
|
|
123
123
|
sample_creation=None,
|
|
124
124
|
bibtex_citation="""@misc{qiu2022dureaderretrieval,
|
|
125
|
-
title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine},
|
|
125
|
+
title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine},
|
|
126
126
|
author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang},
|
|
127
127
|
year={2022},
|
|
128
128
|
eprint={2203.10232},
|
|
@@ -130,14 +130,14 @@ class DuRetrieval(AbsTaskRetrieval):
|
|
|
130
130
|
primaryClass={cs.CL}
|
|
131
131
|
}""",
|
|
132
132
|
descriptive_stats={
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
133
|
+
'n_samples': None,
|
|
134
|
+
'avg_character_length': {
|
|
135
|
+
'dev': {
|
|
136
|
+
'average_document_length': 331.3219967800322,
|
|
137
|
+
'average_query_length': 9.289,
|
|
138
|
+
'num_documents': 100001,
|
|
139
|
+
'num_queries': 2000,
|
|
140
|
+
'average_relevant_docs_per_query': 4.9195,
|
|
141
141
|
}
|
|
142
142
|
},
|
|
143
143
|
},
|
|
@@ -146,20 +146,20 @@ class DuRetrieval(AbsTaskRetrieval):
|
|
|
146
146
|
|
|
147
147
|
class CovidRetrieval(AbsTaskRetrieval):
|
|
148
148
|
metadata = TaskMetadata(
|
|
149
|
-
name=
|
|
150
|
-
description=
|
|
151
|
-
reference=
|
|
149
|
+
name='CovidRetrieval',
|
|
150
|
+
description='COVID-19 news articles',
|
|
151
|
+
reference='https://arxiv.org/abs/2203.03367',
|
|
152
152
|
dataset={
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
153
|
+
'path': 'C-MTEB/CovidRetrieval',
|
|
154
|
+
'revision': '1271c7809071a13532e05f25fb53511ffce77117',
|
|
155
|
+
'qrel_revision': 'a9f41b7cdf24785531d12417ce0d1157ed4b39ca',
|
|
156
156
|
},
|
|
157
|
-
type=
|
|
158
|
-
category=
|
|
159
|
-
modalities=[
|
|
160
|
-
eval_splits=[
|
|
161
|
-
eval_langs=[
|
|
162
|
-
main_score=
|
|
157
|
+
type='Retrieval',
|
|
158
|
+
category='s2p',
|
|
159
|
+
modalities=['text'],
|
|
160
|
+
eval_splits=['dev'],
|
|
161
|
+
eval_langs=['cmn-Hans'],
|
|
162
|
+
main_score='ndcg_at_10',
|
|
163
163
|
date=None,
|
|
164
164
|
domains=None,
|
|
165
165
|
task_subtypes=None,
|
|
@@ -169,14 +169,14 @@ class CovidRetrieval(AbsTaskRetrieval):
|
|
|
169
169
|
sample_creation=None,
|
|
170
170
|
bibtex_citation=None,
|
|
171
171
|
descriptive_stats={
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
172
|
+
'n_samples': None,
|
|
173
|
+
'avg_character_length': {
|
|
174
|
+
'dev': {
|
|
175
|
+
'average_document_length': 332.4152658473415,
|
|
176
|
+
'average_query_length': 25.9304531085353,
|
|
177
|
+
'num_documents': 100001,
|
|
178
|
+
'num_queries': 949,
|
|
179
|
+
'average_relevant_docs_per_query': 1.0105374077976819,
|
|
180
180
|
}
|
|
181
181
|
},
|
|
182
182
|
},
|
|
@@ -185,20 +185,20 @@ class CovidRetrieval(AbsTaskRetrieval):
|
|
|
185
185
|
|
|
186
186
|
class CmedqaRetrieval(AbsTaskRetrieval):
|
|
187
187
|
metadata = TaskMetadata(
|
|
188
|
-
name=
|
|
189
|
-
description=
|
|
190
|
-
reference=
|
|
188
|
+
name='CmedqaRetrieval',
|
|
189
|
+
description='Online medical consultation text. Used the CMedQAv2 as its underlying dataset.',
|
|
190
|
+
reference='https://aclanthology.org/2022.emnlp-main.357.pdf',
|
|
191
191
|
dataset={
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
192
|
+
'path': 'C-MTEB/CmedqaRetrieval',
|
|
193
|
+
'revision': 'cd540c506dae1cf9e9a59c3e06f42030d54e7301',
|
|
194
|
+
'qrel_revision': '279d737f36c731c8ff6e2b055f31fe02216fa23d',
|
|
195
195
|
},
|
|
196
|
-
type=
|
|
197
|
-
category=
|
|
198
|
-
modalities=[
|
|
199
|
-
eval_splits=[
|
|
200
|
-
eval_langs=[
|
|
201
|
-
main_score=
|
|
196
|
+
type='Retrieval',
|
|
197
|
+
category='s2p',
|
|
198
|
+
modalities=['text'],
|
|
199
|
+
eval_splits=['dev'],
|
|
200
|
+
eval_langs=['cmn-Hans'],
|
|
201
|
+
main_score='ndcg_at_10',
|
|
202
202
|
date=None,
|
|
203
203
|
domains=None,
|
|
204
204
|
task_subtypes=None,
|
|
@@ -208,14 +208,14 @@ class CmedqaRetrieval(AbsTaskRetrieval):
|
|
|
208
208
|
sample_creation=None,
|
|
209
209
|
bibtex_citation=None,
|
|
210
210
|
descriptive_stats={
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
211
|
+
'n_samples': None,
|
|
212
|
+
'avg_character_length': {
|
|
213
|
+
'dev': {
|
|
214
|
+
'average_document_length': 307.7710222897771,
|
|
215
|
+
'average_query_length': 48.470367591897976,
|
|
216
|
+
'num_documents': 100001,
|
|
217
|
+
'num_queries': 3999,
|
|
218
|
+
'average_relevant_docs_per_query': 1.86271567891973,
|
|
219
219
|
}
|
|
220
220
|
},
|
|
221
221
|
},
|
|
@@ -226,20 +226,20 @@ class EcomRetrieval(AbsTaskRetrieval):
|
|
|
226
226
|
ignore_identical_ids = True
|
|
227
227
|
|
|
228
228
|
metadata = TaskMetadata(
|
|
229
|
-
name=
|
|
230
|
-
description=
|
|
231
|
-
reference=
|
|
229
|
+
name='EcomRetrieval',
|
|
230
|
+
description='EcomRetrieval',
|
|
231
|
+
reference='https://arxiv.org/abs/2203.03367',
|
|
232
232
|
dataset={
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
233
|
+
'path': 'C-MTEB/EcomRetrieval',
|
|
234
|
+
'revision': '687de13dc7294d6fd9be10c6945f9e8fec8166b9',
|
|
235
|
+
'qrel_revision': '39c90699b034ec22ac45b3abf5b0bbb5ffd421f9',
|
|
236
236
|
},
|
|
237
|
-
type=
|
|
238
|
-
category=
|
|
239
|
-
modalities=[
|
|
240
|
-
eval_splits=[
|
|
241
|
-
eval_langs=[
|
|
242
|
-
main_score=
|
|
237
|
+
type='Retrieval',
|
|
238
|
+
category='s2p',
|
|
239
|
+
modalities=['text'],
|
|
240
|
+
eval_splits=['dev'],
|
|
241
|
+
eval_langs=['cmn-Hans'],
|
|
242
|
+
main_score='ndcg_at_10',
|
|
243
243
|
date=None,
|
|
244
244
|
domains=None,
|
|
245
245
|
task_subtypes=None,
|
|
@@ -249,14 +249,14 @@ class EcomRetrieval(AbsTaskRetrieval):
|
|
|
249
249
|
sample_creation=None,
|
|
250
250
|
bibtex_citation=None,
|
|
251
251
|
descriptive_stats={
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
252
|
+
'n_samples': None,
|
|
253
|
+
'avg_character_length': {
|
|
254
|
+
'dev': {
|
|
255
|
+
'average_document_length': 32.98041664189015,
|
|
256
|
+
'average_query_length': 6.798,
|
|
257
|
+
'num_documents': 100902,
|
|
258
|
+
'num_queries': 1000,
|
|
259
|
+
'average_relevant_docs_per_query': 1.0,
|
|
260
260
|
}
|
|
261
261
|
},
|
|
262
262
|
},
|
|
@@ -267,20 +267,20 @@ class MedicalRetrieval(AbsTaskRetrieval):
|
|
|
267
267
|
ignore_identical_ids = True
|
|
268
268
|
|
|
269
269
|
metadata = TaskMetadata(
|
|
270
|
-
name=
|
|
271
|
-
description=
|
|
272
|
-
reference=
|
|
270
|
+
name='MedicalRetrieval',
|
|
271
|
+
description='MedicalRetrieval',
|
|
272
|
+
reference='https://arxiv.org/abs/2203.03367',
|
|
273
273
|
dataset={
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
274
|
+
'path': 'C-MTEB/MedicalRetrieval',
|
|
275
|
+
'revision': '2039188fb5800a9803ba5048df7b76e6fb151fc6',
|
|
276
|
+
'qrel_revision': '37b8efec53c54c3d9c6af212f6710b62ccdf895c',
|
|
277
277
|
},
|
|
278
|
-
type=
|
|
279
|
-
category=
|
|
280
|
-
modalities=[
|
|
281
|
-
eval_splits=[
|
|
282
|
-
eval_langs=[
|
|
283
|
-
main_score=
|
|
278
|
+
type='Retrieval',
|
|
279
|
+
category='s2p',
|
|
280
|
+
modalities=['text'],
|
|
281
|
+
eval_splits=['dev'],
|
|
282
|
+
eval_langs=['cmn-Hans'],
|
|
283
|
+
main_score='ndcg_at_10',
|
|
284
284
|
date=None,
|
|
285
285
|
domains=None,
|
|
286
286
|
task_subtypes=None,
|
|
@@ -290,14 +290,14 @@ class MedicalRetrieval(AbsTaskRetrieval):
|
|
|
290
290
|
sample_creation=None,
|
|
291
291
|
bibtex_citation=None,
|
|
292
292
|
descriptive_stats={
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
293
|
+
'n_samples': None,
|
|
294
|
+
'avg_character_length': {
|
|
295
|
+
'dev': {
|
|
296
|
+
'average_document_length': 122.04231725066585,
|
|
297
|
+
'average_query_length': 17.938,
|
|
298
|
+
'num_documents': 100999,
|
|
299
|
+
'num_queries': 1000,
|
|
300
|
+
'average_relevant_docs_per_query': 1.0,
|
|
301
301
|
}
|
|
302
302
|
},
|
|
303
303
|
},
|
|
@@ -308,20 +308,20 @@ class VideoRetrieval(AbsTaskRetrieval):
|
|
|
308
308
|
ignore_identical_ids = True
|
|
309
309
|
|
|
310
310
|
metadata = TaskMetadata(
|
|
311
|
-
name=
|
|
312
|
-
description=
|
|
313
|
-
reference=
|
|
311
|
+
name='VideoRetrieval',
|
|
312
|
+
description='VideoRetrieval',
|
|
313
|
+
reference='https://arxiv.org/abs/2203.03367',
|
|
314
314
|
dataset={
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
315
|
+
'path': 'C-MTEB/VideoRetrieval',
|
|
316
|
+
'revision': '58c2597a5943a2ba48f4668c3b90d796283c5639',
|
|
317
|
+
'qrel_revision': 'faa71382b6a29cf1778d1f436b963e75cb5b927c',
|
|
318
318
|
},
|
|
319
|
-
type=
|
|
320
|
-
category=
|
|
321
|
-
modalities=[
|
|
322
|
-
eval_splits=[
|
|
323
|
-
eval_langs=[
|
|
324
|
-
main_score=
|
|
319
|
+
type='Retrieval',
|
|
320
|
+
category='s2p',
|
|
321
|
+
modalities=['text'],
|
|
322
|
+
eval_splits=['dev'],
|
|
323
|
+
eval_langs=['cmn-Hans'],
|
|
324
|
+
main_score='ndcg_at_10',
|
|
325
325
|
date=None,
|
|
326
326
|
domains=None,
|
|
327
327
|
task_subtypes=None,
|
|
@@ -331,14 +331,14 @@ class VideoRetrieval(AbsTaskRetrieval):
|
|
|
331
331
|
sample_creation=None,
|
|
332
332
|
bibtex_citation=None,
|
|
333
333
|
descriptive_stats={
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
334
|
+
'n_samples': None,
|
|
335
|
+
'avg_character_length': {
|
|
336
|
+
'dev': {
|
|
337
|
+
'average_document_length': 31.048855642524522,
|
|
338
|
+
'average_query_length': 7.365,
|
|
339
|
+
'num_documents': 100930,
|
|
340
|
+
'num_queries': 1000,
|
|
341
|
+
'average_relevant_docs_per_query': 1.0,
|
|
342
342
|
}
|
|
343
343
|
},
|
|
344
344
|
},
|