evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
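
The cmteb task changes in the hunks below are largely mechanical: the MTEB TaskMetadata definitions are rewritten from double to single quotes and their dict literals are expanded. For orientation, here is a minimal sketch of how one of these task classes (TNews, defined in evalscope/backend/rag_eval/cmteb/tasks/Classification.py) could be run directly through the standard mteb evaluation API. The embedding model name is illustrative and not part of this release, and within evalscope these tasks appear to be driven through the rag_eval backend (see task_template.py above), so this is only an illustration of the underlying interface:

# Hedged sketch: evaluate the TNews task with the standard mteb API.
# Assumes mteb and sentence-transformers are installed; the model name below
# is illustrative, not something prescribed by this release.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

from evalscope.backend.rag_eval.cmteb.tasks.Classification import TNews

model = SentenceTransformer('BAAI/bge-small-zh-v1.5')  # any text-embedding model
evaluation = MTEB(tasks=[TNews()])  # task class shown in the diff below
evaluation.run(model, output_folder='outputs/mteb_results')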

evalscope/backend/rag_eval/cmteb/tasks/Classification.py

@@ -4,19 +4,19 @@ from mteb.abstasks.TaskMetadata import TaskMetadata
 
  class TNews(AbsTaskClassification):
  metadata = TaskMetadata(
- name="TNews",
- description="Short Text Classification for News",
- reference="https://www.cluebenchmarks.com/introduce.html",
+ name='TNews',
+ description='Short Text Classification for News',
+ reference='https://www.cluebenchmarks.com/introduce.html',
  dataset={
- "path": "C-MTEB/TNews-classification",
- "revision": "317f262bf1e6126357bbe89e875451e4b0938fe4",
+ 'path': 'C-MTEB/TNews-classification',
+ 'revision': '317f262bf1e6126357bbe89e875451e4b0938fe4',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -67,31 +67,34 @@ class TNews(AbsTaskClassification):
  doi = "10.18653/v1/2020.coling-main.419",
  pages = "4762--4772",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class IFlyTek(AbsTaskClassification):
  metadata = TaskMetadata(
- name="IFlyTek",
- description="Long Text classification for the description of Apps",
- reference="https://www.cluebenchmarks.com/introduce.html",
+ name='IFlyTek',
+ description='Long Text classification for the description of Apps',
+ reference='https://www.cluebenchmarks.com/introduce.html',
  dataset={
- "path": "C-MTEB/IFlyTek-classification",
- "revision": "421605374b29664c5fc098418fe20ada9bd55f8a",
+ 'path': 'C-MTEB/IFlyTek-classification',
+ 'revision': '421605374b29664c5fc098418fe20ada9bd55f8a',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -143,32 +146,36 @@ class IFlyTek(AbsTaskClassification):
  pages = "4762--4772",
  abstract = "The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. Our benchmark is released at https://www.cluebenchmarks.com",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
- metadata_dict["n_experiments"] = 5
+ metadata_dict['samples_per_label'] = 32
+ metadata_dict['n_experiments'] = 5
  return metadata_dict
 
 
  class MultilingualSentiment(AbsTaskClassification):
  metadata = TaskMetadata(
- name="MultilingualSentiment",
- description="A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative",
- reference="https://github.com/tyqiangz/multilingual-sentiment-datasets",
+ name='MultilingualSentiment',
+ description=
+ 'A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative',
+ reference='https://github.com/tyqiangz/multilingual-sentiment-datasets',
  dataset={
- "path": "C-MTEB/MultilingualSentiment-classification",
- "revision": "46958b007a63fdbf239b7672c25d0bea67b5ea1a",
+ 'path': 'C-MTEB/MultilingualSentiment-classification',
+ 'revision': '46958b007a63fdbf239b7672c25d0bea67b5ea1a',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation", "test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation', 'test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -177,31 +184,34 @@ class MultilingualSentiment(AbsTaskClassification):
  dialect=None,
  sample_creation=None,
  bibtex_citation=None,
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class JDReview(AbsTaskClassification):
  metadata = TaskMetadata(
- name="JDReview",
- description="review for iphone",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='JDReview',
+ description='review for iphone',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/JDReview-classification",
- "revision": "b7c64bd89eb87f8ded463478346f76731f07bf8b",
+ 'path': 'C-MTEB/JDReview-classification',
+ 'revision': 'b7c64bd89eb87f8ded463478346f76731f07bf8b',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -215,31 +225,34 @@ class JDReview(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class OnlineShopping(AbsTaskClassification):
  metadata = TaskMetadata(
- name="OnlineShopping",
- description="Sentiment Analysis of User Reviews on Online Shopping Websites",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='OnlineShopping',
+ description='Sentiment Analysis of User Reviews on Online Shopping Websites',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/OnlineShopping-classification",
- "revision": "e610f2ebd179a8fda30ae534c3878750a96db120",
+ 'path': 'C-MTEB/OnlineShopping-classification',
+ 'revision': 'e610f2ebd179a8fda30ae534c3878750a96db120',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -253,31 +266,34 @@ class OnlineShopping(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class Waimai(AbsTaskClassification):
  metadata = TaskMetadata(
- name="Waimai",
- description="Sentiment Analysis of user reviews on takeaway platforms",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='Waimai',
+ description='Sentiment Analysis of user reviews on takeaway platforms',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/waimai-classification",
- "revision": "339287def212450dcaa9df8c22bf93e9980c7023",
+ 'path': 'C-MTEB/waimai-classification',
+ 'revision': '339287def212450dcaa9df8c22bf93e9980c7023',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -291,12 +307,15 @@ class Waimai(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
 
  return metadata_dict

evalscope/backend/rag_eval/cmteb/tasks/Clustering.py

@@ -1,12 +1,7 @@
  import itertools
-
  from datasets import Dataset, DatasetDict
-
  from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
- from mteb.abstasks.AbsTaskClusteringFast import (
- AbsTaskClusteringFast,
- check_label_distribution,
- )
+ from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, check_label_distribution
  from mteb.abstasks.TaskMetadata import TaskMetadata
 
  NUM_SAMPLES = 2048
@@ -46,7 +41,9 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
  primaryClass={cs.CL}
  }""", # noqa
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -55,9 +52,7 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -106,7 +101,9 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
  primaryClass={cs.CL}
  }""", # noqa
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -115,9 +112,7 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -166,7 +161,9 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -175,9 +172,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -226,7 +221,9 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -235,9 +232,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py

@@ -1,31 +1,29 @@
- from typing import Optional
  from mteb import AbsTaskRetrieval
  from mteb import HFDataLoader as CustomDataLoader
  from mteb.abstasks.TaskMetadata import TaskMetadata
+ from typing import Optional
 
 
  class CustomRetrieval(AbsTaskRetrieval):
  metadata: TaskMetadata
  ignore_identical_ids: bool = True
 
- def __init__(
- self, dataset_path: Optional[str] = "custom_eval/text/retrieval", **kwargs
- ):
+ def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
  super().__init__(**kwargs)
  self.metadata = TaskMetadata(
- name="CustomRetrieval",
- description="CustomRetrieval Task",
+ name='CustomRetrieval',
+ description='CustomRetrieval Task',
  reference=None,
  dataset={
- "path": dataset_path,
- "revision": "v1",
+ 'path': dataset_path,
+ 'revision': 'v1',
  },
- type="Retrieval",
- category="s2p",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="recall_at_5",
+ type='Retrieval',
+ category='s2p',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='recall_at_5',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -33,7 +31,7 @@ class CustomRetrieval(AbsTaskRetrieval):
  annotations_creators=None,
  dialect=None,
  sample_creation=None,
- bibtex_citation="",
+ bibtex_citation='',
  descriptive_stats={},
  )
 
@@ -41,17 +39,17 @@
  if self.data_loaded:
  return
  self.corpus, self.queries, self.relevant_docs = {}, {}, {}
- dataset_path = self.metadata_dict["dataset"]["path"]
+ dataset_path = self.metadata_dict['dataset']['path']
 
- for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
+ for split in kwargs.get('eval_splits', self.metadata_dict['eval_splits']):
  corpus, queries, qrels = CustomDataLoader(
  data_folder=dataset_path,
  streaming=False,
  keep_in_memory=False,
  ).load(split=split)
  # Conversion from DataSet
- queries = {query["id"]: query["text"] for query in queries}
- corpus = {doc["id"]: {"text": doc["text"]} for doc in corpus}
+ queries = {query['id']: query['text'] for query in queries}
+ corpus = {doc['id']: {'text': doc['text']} for doc in corpus}
  self.corpus[split], self.queries[split], self.relevant_docs[split] = (
  corpus,
  queries,

evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py

@@ -4,19 +4,19 @@ from mteb.abstasks.TaskMetadata import TaskMetadata
 
  class Ocnli(AbsTaskPairClassification):
  metadata = TaskMetadata(
- name="Ocnli",
- description="Original Chinese Natural Language Inference dataset",
- reference="https://arxiv.org/abs/2010.05444",
+ name='Ocnli',
+ description='Original Chinese Natural Language Inference dataset',
+ reference='https://arxiv.org/abs/2010.05444',
  dataset={
- "path": "C-MTEB/OCNLI",
- "revision": "66e76a618a34d6d565d5538088562851e6daa7ec",
+ 'path': 'C-MTEB/OCNLI',
+ 'revision': '66e76a618a34d6d565d5538088562851e6daa7ec',
  },
- type="PairClassification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="max_accuracy",
+ type='PairClassification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='max_accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -25,36 +25,39 @@ class Ocnli(AbsTaskPairClassification):
  dialect=None,
  sample_creation=None,
  bibtex_citation="""@misc{hu2020ocnli,
- title={OCNLI: Original Chinese Natural Language Inference},
+ title={OCNLI: Original Chinese Natural Language Inference},
  author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss},
  year={2020},
  eprint={2010.05444},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  def dataset_transform(self):
- self.dataset = self.dataset.rename_column("sent1", "sentence1")
- self.dataset = self.dataset.rename_column("sent2", "sentence2")
+ self.dataset = self.dataset.rename_column('sent1', 'sentence1')
+ self.dataset = self.dataset.rename_column('sent2', 'sentence2')
 
 
  class Cmnli(AbsTaskPairClassification):
  metadata = TaskMetadata(
- name="Cmnli",
- description="Chinese Multi-Genre NLI",
- reference="https://huggingface.co/datasets/clue/viewer/cmnli",
+ name='Cmnli',
+ description='Chinese Multi-Genre NLI',
+ reference='https://huggingface.co/datasets/clue/viewer/cmnli',
  dataset={
- "path": "C-MTEB/CMNLI",
- "revision": "41bc36f332156f7adc9e38f53777c959b2ae9766",
+ 'path': 'C-MTEB/CMNLI',
+ 'revision': '41bc36f332156f7adc9e38f53777c959b2ae9766',
  },
- type="PairClassification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation", "test"],
- eval_langs=["cmn-Hans"],
- main_score="max_accuracy",
+ type='PairClassification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation', 'test'],
+ eval_langs=['cmn-Hans'],
+ main_score='max_accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -105,9 +108,12 @@ class Cmnli(AbsTaskPairClassification):
  doi = "10.18653/v1/2020.coling-main.419",
  pages = "4762--4772",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  def dataset_transform(self):
- self.dataset = self.dataset.rename_column("sent1", "sentence1")
- self.dataset = self.dataset.rename_column("sent2", "sentence2")
+ self.dataset = self.dataset.rename_column('sent1', 'sentence1')
+ self.dataset = self.dataset.rename_column('sent2', 'sentence2')

evalscope/backend/rag_eval/cmteb/tasks/Reranking.py

@@ -33,7 +33,10 @@ class T2Reranking(AbsTaskReranking):
  archivePrefix={arXiv},
  primaryClass={cs.IR}
  }""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
 
@@ -68,7 +71,10 @@ class MMarcoReranking(AbsTaskReranking):
  archivePrefix={arXiv},
  primaryClass={cs.CL}
  }""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
 
@@ -105,8 +111,12 @@ class CMedQAv1(AbsTaskReranking):
  publisher={Multidisciplinary Digital Publishing Institute}
  }""",
  descriptive_stats={
- 'n_samples': {'test': 2000},
- 'avg_character_length': {'test': 165},
+ 'n_samples': {
+ 'test': 2000
+ },
+ 'avg_character_length': {
+ 'test': 165
+ },
  },
  )
 
@@ -146,5 +156,8 @@ keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extr
  doi={10.1109/ACCESS.2018.2883637},
  ISSN={2169-3536},
  month={},}""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )