evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
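
The hunks below are the complete diff for entry 27, evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py (+163 -163): a style-only pass that rewrites the C-MTEB retrieval task definitions from double-quoted to single-quoted string literals, leaving task names, dataset revisions, and descriptive statistics unchanged. These task classes follow the mteb TaskMetadata/AbsTaskRetrieval convention, so in principle they can be exercised directly with the mteb harness. A minimal sketch, assuming the mteb and sentence-transformers packages are installed; the embedding model name is an illustrative assumption, not something this release pins:

    # Minimal usage sketch (assumptions noted above): evaluate one of the
    # retrieval tasks defined in Retrieval.py with the mteb harness.
    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    from evalscope.backend.rag_eval.cmteb.tasks.Retrieval import T2Retrieval

    model = SentenceTransformer('BAAI/bge-small-zh-v1.5')  # any embedding model with .encode() works
    evaluation = MTEB(tasks=[T2Retrieval()])               # eval_splits=['dev'] comes from the task metadata
    results = evaluation.run(model, output_folder='outputs')  # main_score for these tasks: ndcg_at_10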
--- a/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py
+++ b/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py
@@ -6,20 +6,20 @@ class T2Retrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
 
     metadata = TaskMetadata(
-        name="T2Retrieval",
-        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
-        reference="https://arxiv.org/abs/2304.03679",
+        name='T2Retrieval',
+        description='T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
+        reference='https://arxiv.org/abs/2304.03679',
         dataset={
-            "path": "C-MTEB/T2Retrieval",
-            "revision": "8731a845f1bf500a4f111cf1070785c793d10e64",
-            "qrel_revision": "1c83b8d1544e529875e3f6930f3a1fcf749a8e97",
+            'path': 'C-MTEB/T2Retrieval',
+            'revision': '8731a845f1bf500a4f111cf1070785c793d10e64',
+            'qrel_revision': '1c83b8d1544e529875e3f6930f3a1fcf749a8e97',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -28,7 +28,7 @@ class T2Retrieval(AbsTaskRetrieval):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xie2023t2ranking,
-      title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking}, 
+      title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
       author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
       year={2023},
       eprint={2304.03679},
@@ -36,14 +36,14 @@ class T2Retrieval(AbsTaskRetrieval):
       primaryClass={cs.IR}
 }""",
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 874.1184182791619,
-                    "average_query_length": 10.938847974750132,
-                    "num_documents": 118605,
-                    "num_queries": 22812,
-                    "average_relevant_docs_per_query": 5.213571804313519,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 874.1184182791619,
+                    'average_query_length': 10.938847974750132,
+                    'num_documents': 118605,
+                    'num_queries': 22812,
+                    'average_relevant_docs_per_query': 5.213571804313519,
                 }
             },
         },
@@ -54,20 +54,20 @@ class MMarcoRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
 
     metadata = TaskMetadata(
-        name="MMarcoRetrieval",
-        description="MMarcoRetrieval",
-        reference="https://arxiv.org/abs/2309.07597",
+        name='MMarcoRetrieval',
+        description='MMarcoRetrieval',
+        reference='https://arxiv.org/abs/2309.07597',
         dataset={
-            "path": "C-MTEB/MMarcoRetrieval",
-            "revision": "539bbde593d947e2a124ba72651aafc09eb33fc2",
-            "qrel_revision": "bae08bb7bddbedb96c7e7db52018a55167b67f89",
+            'path': 'C-MTEB/MMarcoRetrieval',
+            'revision': '539bbde593d947e2a124ba72651aafc09eb33fc2',
+            'qrel_revision': 'bae08bb7bddbedb96c7e7db52018a55167b67f89',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -76,7 +76,7 @@ class MMarcoRetrieval(AbsTaskRetrieval):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpack,
-      title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, 
+      title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
       author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
       year={2024},
       eprint={2309.07597},
@@ -84,14 +84,14 @@ class MMarcoRetrieval(AbsTaskRetrieval):
       primaryClass={cs.CL}
 }""",
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 114.41787048392986,
-                    "average_query_length": 10.51131805157593,
-                    "num_documents": 106813,
-                    "num_queries": 6980,
-                    "average_relevant_docs_per_query": 1.0654727793696275,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 114.41787048392986,
+                    'average_query_length': 10.51131805157593,
+                    'num_documents': 106813,
+                    'num_queries': 6980,
+                    'average_relevant_docs_per_query': 1.0654727793696275,
                 }
             },
         },
@@ -100,20 +100,20 @@ class MMarcoRetrieval(AbsTaskRetrieval):
 
 class DuRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
-        name="DuRetrieval",
-        description="A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine",
-        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
+        name='DuRetrieval',
+        description='A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine',
+        reference='https://aclanthology.org/2022.emnlp-main.357.pdf',
         dataset={
-            "path": "C-MTEB/DuRetrieval",
-            "revision": "a1a333e290fe30b10f3f56498e3a0d911a693ced",
-            "qrel_revision": "497b7bd1bbb25cb3757ff34d95a8be50a3de2279",
+            'path': 'C-MTEB/DuRetrieval',
+            'revision': 'a1a333e290fe30b10f3f56498e3a0d911a693ced',
+            'qrel_revision': '497b7bd1bbb25cb3757ff34d95a8be50a3de2279',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -122,7 +122,7 @@ class DuRetrieval(AbsTaskRetrieval):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{qiu2022dureaderretrieval,
-      title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine}, 
+      title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine},
       author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang},
       year={2022},
       eprint={2203.10232},
@@ -130,14 +130,14 @@ class DuRetrieval(AbsTaskRetrieval):
       primaryClass={cs.CL}
 }""",
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 331.3219967800322,
-                    "average_query_length": 9.289,
-                    "num_documents": 100001,
-                    "num_queries": 2000,
-                    "average_relevant_docs_per_query": 4.9195,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 331.3219967800322,
+                    'average_query_length': 9.289,
+                    'num_documents': 100001,
+                    'num_queries': 2000,
+                    'average_relevant_docs_per_query': 4.9195,
                 }
             },
         },
@@ -146,20 +146,20 @@ class DuRetrieval(AbsTaskRetrieval):
 
 class CovidRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
-        name="CovidRetrieval",
-        description="COVID-19 news articles",
-        reference="https://arxiv.org/abs/2203.03367",
+        name='CovidRetrieval',
+        description='COVID-19 news articles',
+        reference='https://arxiv.org/abs/2203.03367',
         dataset={
-            "path": "C-MTEB/CovidRetrieval",
-            "revision": "1271c7809071a13532e05f25fb53511ffce77117",
-            "qrel_revision": "a9f41b7cdf24785531d12417ce0d1157ed4b39ca",
+            'path': 'C-MTEB/CovidRetrieval',
+            'revision': '1271c7809071a13532e05f25fb53511ffce77117',
+            'qrel_revision': 'a9f41b7cdf24785531d12417ce0d1157ed4b39ca',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -169,14 +169,14 @@ class CovidRetrieval(AbsTaskRetrieval):
         sample_creation=None,
         bibtex_citation=None,
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 332.4152658473415,
-                    "average_query_length": 25.9304531085353,
-                    "num_documents": 100001,
-                    "num_queries": 949,
-                    "average_relevant_docs_per_query": 1.0105374077976819,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 332.4152658473415,
+                    'average_query_length': 25.9304531085353,
+                    'num_documents': 100001,
+                    'num_queries': 949,
+                    'average_relevant_docs_per_query': 1.0105374077976819,
                 }
             },
         },
@@ -185,20 +185,20 @@ class CovidRetrieval(AbsTaskRetrieval):
 
 class CmedqaRetrieval(AbsTaskRetrieval):
     metadata = TaskMetadata(
-        name="CmedqaRetrieval",
-        description="Online medical consultation text. Used the CMedQAv2 as its underlying dataset.",
-        reference="https://aclanthology.org/2022.emnlp-main.357.pdf",
+        name='CmedqaRetrieval',
+        description='Online medical consultation text. Used the CMedQAv2 as its underlying dataset.',
+        reference='https://aclanthology.org/2022.emnlp-main.357.pdf',
         dataset={
-            "path": "C-MTEB/CmedqaRetrieval",
-            "revision": "cd540c506dae1cf9e9a59c3e06f42030d54e7301",
-            "qrel_revision": "279d737f36c731c8ff6e2b055f31fe02216fa23d",
+            'path': 'C-MTEB/CmedqaRetrieval',
+            'revision': 'cd540c506dae1cf9e9a59c3e06f42030d54e7301',
+            'qrel_revision': '279d737f36c731c8ff6e2b055f31fe02216fa23d',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -208,14 +208,14 @@ class CmedqaRetrieval(AbsTaskRetrieval):
         sample_creation=None,
         bibtex_citation=None,
        descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 307.7710222897771,
-                    "average_query_length": 48.470367591897976,
-                    "num_documents": 100001,
-                    "num_queries": 3999,
-                    "average_relevant_docs_per_query": 1.86271567891973,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 307.7710222897771,
+                    'average_query_length': 48.470367591897976,
+                    'num_documents': 100001,
+                    'num_queries': 3999,
+                    'average_relevant_docs_per_query': 1.86271567891973,
                 }
             },
         },
@@ -226,20 +226,20 @@ class EcomRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
 
     metadata = TaskMetadata(
-        name="EcomRetrieval",
-        description="EcomRetrieval",
-        reference="https://arxiv.org/abs/2203.03367",
+        name='EcomRetrieval',
+        description='EcomRetrieval',
+        reference='https://arxiv.org/abs/2203.03367',
         dataset={
-            "path": "C-MTEB/EcomRetrieval",
-            "revision": "687de13dc7294d6fd9be10c6945f9e8fec8166b9",
-            "qrel_revision": "39c90699b034ec22ac45b3abf5b0bbb5ffd421f9",
+            'path': 'C-MTEB/EcomRetrieval',
+            'revision': '687de13dc7294d6fd9be10c6945f9e8fec8166b9',
+            'qrel_revision': '39c90699b034ec22ac45b3abf5b0bbb5ffd421f9',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -249,14 +249,14 @@ class EcomRetrieval(AbsTaskRetrieval):
         sample_creation=None,
         bibtex_citation=None,
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 32.98041664189015,
-                    "average_query_length": 6.798,
-                    "num_documents": 100902,
-                    "num_queries": 1000,
-                    "average_relevant_docs_per_query": 1.0,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 32.98041664189015,
+                    'average_query_length': 6.798,
+                    'num_documents': 100902,
+                    'num_queries': 1000,
+                    'average_relevant_docs_per_query': 1.0,
                 }
             },
         },
@@ -267,20 +267,20 @@ class MedicalRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
 
     metadata = TaskMetadata(
-        name="MedicalRetrieval",
-        description="MedicalRetrieval",
-        reference="https://arxiv.org/abs/2203.03367",
+        name='MedicalRetrieval',
+        description='MedicalRetrieval',
+        reference='https://arxiv.org/abs/2203.03367',
         dataset={
-            "path": "C-MTEB/MedicalRetrieval",
-            "revision": "2039188fb5800a9803ba5048df7b76e6fb151fc6",
-            "qrel_revision": "37b8efec53c54c3d9c6af212f6710b62ccdf895c",
+            'path': 'C-MTEB/MedicalRetrieval',
+            'revision': '2039188fb5800a9803ba5048df7b76e6fb151fc6',
+            'qrel_revision': '37b8efec53c54c3d9c6af212f6710b62ccdf895c',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -290,14 +290,14 @@ class MedicalRetrieval(AbsTaskRetrieval):
         sample_creation=None,
         bibtex_citation=None,
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 122.04231725066585,
-                    "average_query_length": 17.938,
-                    "num_documents": 100999,
-                    "num_queries": 1000,
-                    "average_relevant_docs_per_query": 1.0,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 122.04231725066585,
+                    'average_query_length': 17.938,
+                    'num_documents': 100999,
+                    'num_queries': 1000,
+                    'average_relevant_docs_per_query': 1.0,
                 }
             },
         },
@@ -308,20 +308,20 @@ class VideoRetrieval(AbsTaskRetrieval):
     ignore_identical_ids = True
 
     metadata = TaskMetadata(
-        name="VideoRetrieval",
-        description="VideoRetrieval",
-        reference="https://arxiv.org/abs/2203.03367",
+        name='VideoRetrieval',
+        description='VideoRetrieval',
+        reference='https://arxiv.org/abs/2203.03367',
         dataset={
-            "path": "C-MTEB/VideoRetrieval",
-            "revision": "58c2597a5943a2ba48f4668c3b90d796283c5639",
-            "qrel_revision": "faa71382b6a29cf1778d1f436b963e75cb5b927c",
+            'path': 'C-MTEB/VideoRetrieval',
+            'revision': '58c2597a5943a2ba48f4668c3b90d796283c5639',
+            'qrel_revision': 'faa71382b6a29cf1778d1f436b963e75cb5b927c',
         },
-        type="Retrieval",
-        category="s2p",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="ndcg_at_10",
+        type='Retrieval',
+        category='s2p',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='ndcg_at_10',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -331,14 +331,14 @@ class VideoRetrieval(AbsTaskRetrieval):
         sample_creation=None,
         bibtex_citation=None,
         descriptive_stats={
-            "n_samples": None,
-            "avg_character_length": {
-                "dev": {
-                    "average_document_length": 31.048855642524522,
-                    "average_query_length": 7.365,
-                    "num_documents": 100930,
-                    "num_queries": 1000,
-                    "average_relevant_docs_per_query": 1.0,
+            'n_samples': None,
+            'avg_character_length': {
+                'dev': {
+                    'average_document_length': 31.048855642524522,
+                    'average_query_length': 7.365,
+                    'num_documents': 100930,
+                    'num_queries': 1000,
+                    'average_relevant_docs_per_query': 1.0,
                 }
             },
         },