evalscope 0.6.0__tar.gz → 0.6.0rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (217)
  1. {evalscope-0.6.0 → evalscope-0.6.0rc0}/PKG-INFO +9 -9
  2. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_datasets.py +2 -1
  3. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +96 -96
  4. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +70 -71
  5. evalscope-0.6.0rc0/evalscope/version.py +4 -0
  6. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/PKG-INFO +9 -9
  7. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/requires.txt +8 -8
  8. evalscope-0.6.0/evalscope/version.py +0 -4
  9. {evalscope-0.6.0 → evalscope-0.6.0rc0}/README.md +0 -0
  10. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/__init__.py +0 -0
  11. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/__init__.py +0 -0
  12. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/base.py +0 -0
  13. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/__init__.py +0 -0
  14. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  15. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  16. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  17. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  18. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/__init__.py +0 -0
  19. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  20. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  21. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  22. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  23. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  24. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  25. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  26. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  27. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  28. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  29. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  30. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  31. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  32. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  33. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  34. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  35. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  36. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  37. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  38. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  39. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  40. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -0
  41. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -0
  42. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -0
  43. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  44. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  45. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  46. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  47. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  48. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  49. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  50. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/__init__.py +0 -0
  51. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/__init__.py +0 -0
  52. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  53. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  54. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  55. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  56. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  57. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  58. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  59. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  60. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  61. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  62. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  63. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  64. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  65. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  66. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  67. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  68. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  69. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  70. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  71. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  72. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  73. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  74. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  75. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  76. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  77. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  78. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  79. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  80. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  81. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  82. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  83. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/benchmark.py +0 -0
  84. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  85. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  86. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  87. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  88. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  89. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  90. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  91. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  92. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  93. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/data_adapter.py +0 -0
  94. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  95. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  96. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  97. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  98. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  99. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  100. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  101. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  102. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  103. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  104. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  105. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  106. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  107. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  108. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/__init__.py +0 -0
  109. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/race.py +0 -0
  110. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  111. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  112. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  113. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  114. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  115. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  116. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  117. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cache.py +0 -0
  118. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/__init__.py +0 -0
  119. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/base.py +0 -0
  120. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/cli.py +0 -0
  121. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/start_perf.py +0 -0
  122. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/start_server.py +0 -0
  123. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/config.py +0 -0
  124. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/constants.py +0 -0
  125. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/__init__.py +0 -0
  126. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/evaluator.py +0 -0
  127. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/rating_eval.py +0 -0
  128. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  129. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  130. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/__init__.py +0 -0
  131. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  132. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  133. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/code_metric.py +0 -0
  134. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/math_accuracy.py +0 -0
  135. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/metrics.py +0 -0
  136. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/rouge_metric.py +0 -0
  137. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/__init__.py +0 -0
  138. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/api/__init__.py +0 -0
  139. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/api/openai_api.py +0 -0
  140. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/custom/__init__.py +0 -0
  141. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/custom/custom_model.py +0 -0
  142. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/dummy_chat_model.py +0 -0
  143. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/model.py +0 -0
  144. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/model_adapter.py +0 -0
  145. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/openai_model.py +0 -0
  146. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/template.py +0 -0
  147. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/__init__.py +0 -0
  148. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/_logging.py +0 -0
  149. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/api_plugin_base.py +0 -0
  150. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/custom_api.py +0 -0
  151. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/dashscope_api.py +0 -0
  152. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/dataset_plugin_base.py +0 -0
  153. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/__init__.py +0 -0
  154. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/line_by_line.py +0 -0
  155. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  156. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/openqa.py +0 -0
  157. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/how_to_analysis_result.py +0 -0
  158. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/http_client.py +0 -0
  159. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/openai_api.py +0 -0
  160. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/plugin_registry.py +0 -0
  161. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/query_parameters.py +0 -0
  162. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/server_sent_event.py +0 -0
  163. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/__init__.py +0 -0
  164. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/tokenizers/__init__.py +0 -0
  165. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  166. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/__init__.py +0 -0
  167. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/arc.yaml +0 -0
  168. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/bbh.yaml +0 -0
  169. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  170. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/ceval.yaml +0 -0
  171. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  172. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  173. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  174. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  175. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  176. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  177. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  178. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run.py +0 -0
  179. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run_arena.py +0 -0
  180. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run_ms.py +0 -0
  181. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/summarizer.py +0 -0
  182. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/__init__.py +0 -0
  183. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  184. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/eval.py +0 -0
  185. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/infer.py +0 -0
  186. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  187. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  188. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  189. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  190. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  191. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  192. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  193. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  194. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/utils.py +0 -0
  195. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  196. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  197. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  198. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  199. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  200. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  201. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/__init__.py +0 -0
  202. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/combine_reports.py +0 -0
  203. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  204. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/rewrite_eval_results.py +0 -0
  205. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/__init__.py +0 -0
  206. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/arena_utils.py +0 -0
  207. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/completion_parsers.py +0 -0
  208. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/logger.py +0 -0
  209. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/task_cfg_parser.py +0 -0
  210. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/task_utils.py +0 -0
  211. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/utils.py +0 -0
  212. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/SOURCES.txt +0 -0
  213. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/dependency_links.txt +0 -0
  214. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/entry_points.txt +0 -0
  215. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/not-zip-safe +0 -0
  216. {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/top_level.txt +0 -0
  217. {evalscope-0.6.0 → evalscope-0.6.0rc0}/setup.cfg +0 -0
{evalscope-0.6.0 → evalscope-0.6.0rc0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0
+Version: 0.6.0rc0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -28,7 +28,7 @@ Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow<=17.0.0
 Requires-Dist: pympler
 Requires-Dist: pyyaml
 Requires-Dist: regex
@@ -48,12 +48,12 @@ Requires-Dist: transformers_stream_generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
+Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
 Provides-Extra: rag
-Requires-Dist: mteb>=0.14.16; extra == "rag"
-Requires-Dist: ragas<0.3,>=0.2.3; extra == "rag"
+Requires-Dist: mteb==1.19.4; extra == "rag"
+Requires-Dist: ragas==0.2.3; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -96,7 +96,7 @@ Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: plotly; extra == "all"
-Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pyarrow<=17.0.0; extra == "all"
 Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: regex; extra == "all"
@@ -115,10 +115,10 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
+Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
-Requires-Dist: mteb>=0.14.16; extra == "all"
-Requires-Dist: ragas<0.3,>=0.2.3; extra == "all"
+Requires-Dist: mteb==1.19.4; extra == "all"
+Requires-Dist: ragas==0.2.3; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 
 
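The substantive dependency changes in this hunk are a new ceiling on pyarrow (<=17.0.0), a higher floor for ms-opencompass (>=0.1.3), and exact pins on mteb (==1.19.4) and ragas (==0.2.3) for the rag extra. As a minimal sketch (not part of the package), the following checks an installed environment against those specifiers; the PINS mapping is copied from the diff, while the checker itself is a hypothetical helper built on importlib.metadata and the third-party packaging library:

# Hypothetical helper: verify installed versions against the 0.6.0rc0 pins.
from importlib.metadata import PackageNotFoundError, version
from packaging.specifiers import SpecifierSet

PINS = {
    'pyarrow': SpecifierSet('<=17.0.0'),
    'ms-opencompass': SpecifierSet('>=0.1.3'),
    'mteb': SpecifierSet('==1.19.4'),
    'ragas': SpecifierSet('==0.2.3'),
}

for name, spec in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f'{name}: not installed')
        continue
    # SpecifierSet.__contains__ accepts a version string.
    status = 'ok' if installed in spec else f'violates {spec}'
    print(f'{name} {installed}: {status}')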
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_datasets.py
@@ -50,12 +50,13 @@ with read_base():
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
 
     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
-    # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
 
 
 datasets = []
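This hunk turns on the cmmlu and bbh dataset configs for the OpenCompass backend; bbh had previously been parked in the "to be supported" comment block. In OpenCompass configs, each imported module exposes a *_datasets list, and the aggregation step that fills `datasets` sits outside this hunk, so the sketch below shows the usual convention rather than the file's actual code:

# Assumed aggregation step (not shown in the hunk): collect every
# `*_datasets` list imported under read_base(), so the newly enabled
# cmmlu_datasets and bbh_datasets are picked up automatically.
datasets = sum(
    (v for k, v in list(locals().items()) if k.endswith('_datasets')),
    [],
)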
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
@@ -17,57 +17,57 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="CLSClusteringS2S",
-        description="Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.",
-        reference="https://arxiv.org/abs/2209.05034",
+        name='CLSClusteringS2S',
+        description='Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-            "path": "C-MTEB/CLSClusteringS2S",
-            "revision": "e458b3f5414b62b7f9f83499ac1f5497ae2e869f",
+            'path': 'C-MTEB/CLSClusteringS2S',
+            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
         },
-        type="Clustering",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2022-01-01", "2022-09-12"),
-        domains=["Academic", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Apache-2.0",
-        annotations_creators="derived",
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-          title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+      title={CSL: A Large-scale Chinese Scientific Literature Dataset},
       author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
       year={2022},
       eprint={2209.05034},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
-      }""",
+      }""",  # noqa
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
@@ -77,57 +77,57 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="CLSClusteringP2P",
-        description="Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.",
-        reference="https://arxiv.org/abs/2209.05034",
+        name='CLSClusteringP2P',
+        description='Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-            "path": "C-MTEB/CLSClusteringP2P",
-            "revision": "4b6227591c6c1a73bc76b1055f3b7f3588e72476",
+            'path': 'C-MTEB/CLSClusteringP2P',
+            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
         },
-        type="Clustering",
-        category="p2p",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2022-01-01", "2022-09-12"),
-        domains=["Academic", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Apache-2.0",
-        annotations_creators="derived",
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-          title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+      title={CSL: A Large-scale Chinese Scientific Literature Dataset},
       author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
       year={2022},
       eprint={2209.05034},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
-      }""",
+      }""",  # noqa
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
@@ -137,26 +137,26 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="ThuNewsClusteringS2S",
+        name='ThuNewsClusteringS2S',
         dataset={
-            "path": "C-MTEB/ThuNewsClusteringS2S",
-            "revision": "8a8b2caeda43f39e13c4bc5bea0f8a667896e10d",
+            'path': 'C-MTEB/ThuNewsClusteringS2S',
+            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
         },
-        description="Clustering of titles from the THUCNews dataset",
-        reference="http://thuctc.thunlp.org/",
-        type="Clustering",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2006-01-01", "2007-01-01"),
-        domains=["News", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Not specified",
-        annotations_creators="derived",
+        description='Clustering of titles from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
       author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
       title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -166,28 +166,28 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
       url = {https://github.com/thunlp/THUCTC}
       }""",
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
@@ -197,26 +197,26 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None
 
     metadata = TaskMetadata(
-        name="ThuNewsClusteringP2P",
+        name='ThuNewsClusteringP2P',
         dataset={
-            "path": "C-MTEB/ThuNewsClusteringP2P",
-            "revision": "5798586b105c0434e4f0fe5e767abe619442cf93",
+            'path': 'C-MTEB/ThuNewsClusteringP2P',
+            'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
         },
-        description="Clustering of titles + abstracts from the THUCNews dataset",
-        reference="http://thuctc.thunlp.org/",
-        type="Clustering",
-        category="p2p",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="v_measure",
-        date=("2006-01-01", "2007-01-01"),
-        domains=["News", "Written"],
-        task_subtypes=["Thematic clustering", "Topic classification"],
-        license="Not specified",
-        annotations_creators="derived",
+        description='Clustering of titles + abstracts from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
       author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
       title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -226,27 +226,27 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
      url = {https://github.com/thunlp/THUCTC}
      }""",
         descriptive_stats={
-            "n_samples": {"test": NUM_SAMPLES},
-            "avg_character_length": {},
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )
 
     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split]["sentences"])
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
            )
 
             check_label_distribution(self.dataset[split])
 
-            ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label="labels",
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
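The dataset_transform bodies above change only quote style, but they document the shape these C-MTEB clustering sets arrive in: each split holds lists of lists, which get flattened with itertools.chain.from_iterable into flat label/sentence columns before stratified subsampling. A toy illustration with made-up data:

import itertools

from datasets import Dataset

# Made-up split in the nested shape the clustering tasks expect.
split = {
    'labels': [[0, 0], [1]],
    'sentences': [['title a', 'title b'], ['title c']],
}
labels = list(itertools.chain.from_iterable(split['labels']))
sentences = list(itertools.chain.from_iterable(split['sentences']))
ds = Dataset.from_dict({'labels': labels, 'sentences': sentences})
print(ds.num_rows)  # 3 flat rows, ready for stratified_subsampling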
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
@@ -2,22 +2,21 @@ from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
 from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata
 
 
-
 class T2Reranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="T2Reranking",
-        description="T2Ranking: A large-scale Chinese Benchmark for Passage Ranking",
-        reference="https://arxiv.org/abs/2304.03679",
+        name='T2Reranking',
+        description='T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
+        reference='https://arxiv.org/abs/2304.03679',
         dataset={
-            "path": "C-MTEB/T2Reranking",
-            "revision": "76631901a18387f85eaa53e5450019b87ad58ef9",
+            'path': 'C-MTEB/T2Reranking',
+            'revision': '76631901a18387f85eaa53e5450019b87ad58ef9',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -27,32 +26,32 @@ class T2Reranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xie2023t2ranking,
-          title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
+      title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
       author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
       year={2023},
       eprint={2304.03679},
       archivePrefix={arXiv},
       primaryClass={cs.IR}
-      }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+      }""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
 
 
 class MMarcoReranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="MMarcoReranking",
-        description="mMARCO is a multilingual version of the MS MARCO passage ranking dataset",
-        reference="https://github.com/unicamp-dl/mMARCO",
+        name='MMarcoReranking',
+        description='mMARCO is a multilingual version of the MS MARCO passage ranking dataset',
+        reference='https://github.com/unicamp-dl/mMARCO',
         dataset={
-            "path": "C-MTEB/Mmarco-reranking",
-            "revision": "8e0c766dbe9e16e1d221116a3f36795fbade07f6",
+            'path': 'C-MTEB/Mmarco-reranking',
+            'revision': '8e0c766dbe9e16e1d221116a3f36795fbade07f6',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["dev"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -62,39 +61,39 @@ class MMarcoReranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{bonifacio2021mmarco,
-          title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
+      title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
       author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and and Roberto Lotufo and Rodrigo Nogueira},
       year={2021},
       eprint={2108.13897},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
-      }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+      }""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
 
 
 class CMedQAv1(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="CMedQAv1",
-        description="Chinese community medical question answering",
-        reference="https://github.com/zhangsheng93/cMedQA",
+        name='CMedQAv1',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA',
         dataset={
-            "path": "C-MTEB/CMedQAv1-reranking",
-            "revision": "8d7f1e942507dac42dc58017c1a001c3717da7df",
+            'path': 'C-MTEB/CMedQAv1-reranking',
+            'revision': '8d7f1e942507dac42dc58017c1a001c3717da7df',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
-        date=("2017-01-01", "2017-07-26"),
-        domains=["Medical", "Written"],
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
+        date=('2017-01-01', '2017-07-26'),
+        domains=['Medical', 'Written'],
         task_subtypes=[],
-        license="Not specified",
-        annotations_creators="expert-annotated",
+        license='apache-2.0',
+        annotations_creators='expert-annotated',
         dialect=[],
-        sample_creation="found",
+        sample_creation='found',
         bibtex_citation="""@article{zhang2017chinese,
       title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs},
       author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun},
@@ -106,27 +105,27 @@ class CMedQAv1(AbsTaskReranking):
       publisher={Multidisciplinary Digital Publishing Institute}
       }""",
         descriptive_stats={
-            "n_samples": {"test": 2000},
-            "avg_character_length": {"test": 165},
+            'n_samples': {'test': 2000},
+            'avg_character_length': {'test': 165},
         },
     )
 
 
 class CMedQAv2(AbsTaskReranking):
     metadata = TaskMetadata(
-        name="CMedQAv2",
-        description="Chinese community medical question answering",
-        reference="https://github.com/zhangsheng93/cMedQA2",
+        name='CMedQAv2',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA2',
         dataset={
-            "path": "C-MTEB/CMedQAv2-reranking",
-            "revision": "23d186750531a14a0357ca22cd92d712fd512ea0",
+            'path': 'C-MTEB/CMedQAv2-reranking',
+            'revision': '23d186750531a14a0357ca22cd92d712fd512ea0',
         },
-        type="Reranking",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="map",
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -135,17 +134,17 @@ class CMedQAv2(AbsTaskReranking):
         annotations_creators=None,
         dialect=None,
         sample_creation=None,
-        bibtex_citation="""@ARTICLE{8548603,
-          author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
-          journal={IEEE Access},
-          title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
-          year={2018},
-          volume={6},
-          number={},
-          pages={74061-74071},
-          keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
-          doi={10.1109/ACCESS.2018.2883637},
-          ISSN={2169-3536},
-          month={},}""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        bibtex_citation="""@ARTICLE{8548603,
+      author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
+      journal={IEEE Access},
+      title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
+      year={2018},
+      volume={6},
+      number={},
+      pages={74061-74071},
+      keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
+      doi={10.1109/ACCESS.2018.2883637},
+      ISSN={2169-3536},
+      month={},}""",  # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )
evalscope-0.6.0rc0/evalscope/version.py (new file)
@@ -0,0 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+__version__ = "0.6.0rc0"
+__release_datetime__ = "2099-01-01 00:00:00"
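This new module is what carries the release tag (the old evalscope-0.6.0/evalscope/version.py is deleted, per entry 8 in the file list); the 2099 release datetime is the placeholder value committed in the source. Assuming a normal install of the sdist, both names are importable from the path shown above:

from evalscope.version import __release_datetime__, __version__

print(__version__)           # 0.6.0rc0
print(__release_datetime__)  # 2099-01-01 00:00:00 (placeholder in source)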