evalscope 0.6.0__tar.gz → 0.6.1__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of evalscope might be problematic.
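
For reference, a minimal sketch of reproducing this comparison locally, assuming pip and a POSIX shell (the exact sdist filenames may vary):

pip download evalscope==0.6.0 evalscope==0.6.1 --no-deps --no-binary :all:
tar xzf evalscope-0.6.0.tar.gz && tar xzf evalscope-0.6.1.tar.gz
diff -ru evalscope-0.6.0 evalscope-0.6.1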

Files changed (222)
  1. {evalscope-0.6.0 → evalscope-0.6.1}/PKG-INFO +14 -13
  2. {evalscope-0.6.0 → evalscope-0.6.1}/README.md +6 -5
  3. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
  4. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +96 -96
  5. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +70 -71
  6. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  7. evalscope-0.6.1/evalscope/backend/rag_eval/utils/clip.py +149 -0
  8. evalscope-0.6.1/evalscope/backend/rag_eval/utils/embedding.py +183 -0
  9. evalscope-0.6.1/evalscope/backend/rag_eval/utils/llm.py +72 -0
  10. evalscope-0.6.1/evalscope/backend/rag_eval/utils/tools.py +63 -0
  11. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  12. evalscope-0.6.1/evalscope/preprocess/tokenizers/__init__.py +0 -0
  13. evalscope-0.6.1/evalscope/version.py +4 -0
  14. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/PKG-INFO +14 -13
  15. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/SOURCES.txt +5 -0
  16. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/requires.txt +8 -8
  17. evalscope-0.6.0/evalscope/version.py +0 -4
  18. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/__init__.py +0 -0
  19. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/__init__.py +0 -0
  20. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/base.py +0 -0
  21. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/__init__.py +0 -0
  22. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  23. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  24. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  25. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  26. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  27. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  28. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  29. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  30. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  31. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  32. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  33. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  34. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  35. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  36. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  37. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  38. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  39. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  40. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  41. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  42. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  43. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  44. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  45. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  46. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  47. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  48. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -0
  49. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -0
  50. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -0
  51. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  52. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  53. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  54. {evalscope-0.6.0/evalscope/perf → evalscope-0.6.1/evalscope/backend/rag_eval/utils}/__init__.py +0 -0
  55. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  56. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  57. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  58. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/__init__.py +0 -0
  59. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  60. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  61. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  62. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  63. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  64. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  65. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  66. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  67. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  68. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  69. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  70. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  71. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  72. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  73. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  74. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  75. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  76. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  77. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  78. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  79. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  80. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  81. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  82. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  83. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  84. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  85. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  86. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  87. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  88. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  89. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  90. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  91. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/benchmark.py +0 -0
  92. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  93. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  94. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  95. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  96. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  97. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  98. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  99. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  100. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  101. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/data_adapter.py +0 -0
  102. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  103. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  104. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  105. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  106. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  107. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  108. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  109. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  110. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  111. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  112. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  113. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  114. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  115. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  116. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/__init__.py +0 -0
  117. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/race.py +0 -0
  118. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  119. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  120. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  121. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  122. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  123. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  124. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  125. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cache.py +0 -0
  126. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/__init__.py +0 -0
  127. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/base.py +0 -0
  128. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/cli.py +0 -0
  129. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/start_perf.py +0 -0
  130. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/start_server.py +0 -0
  131. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/config.py +0 -0
  132. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/constants.py +0 -0
  133. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/__init__.py +0 -0
  134. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/evaluator.py +0 -0
  135. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/rating_eval.py +0 -0
  136. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  137. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  138. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/__init__.py +0 -0
  139. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  140. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/code_metric.py +0 -0
  141. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/math_accuracy.py +0 -0
  142. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/metrics.py +0 -0
  143. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/rouge_metric.py +0 -0
  144. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/__init__.py +0 -0
  145. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/api/__init__.py +0 -0
  146. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/api/openai_api.py +0 -0
  147. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/custom/__init__.py +0 -0
  148. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/custom/custom_model.py +0 -0
  149. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/dummy_chat_model.py +0 -0
  150. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/model.py +0 -0
  151. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/model_adapter.py +0 -0
  152. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/openai_model.py +0 -0
  153. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/template.py +0 -0
  154. {evalscope-0.6.0/evalscope/perf/datasets → evalscope-0.6.1/evalscope/perf}/__init__.py +0 -0
  155. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/_logging.py +0 -0
  156. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/api_plugin_base.py +0 -0
  157. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/custom_api.py +0 -0
  158. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/dashscope_api.py +0 -0
  159. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/dataset_plugin_base.py +0 -0
  160. {evalscope-0.6.0/evalscope/preprocess/tokenizers → evalscope-0.6.1/evalscope/perf/datasets}/__init__.py +0 -0
  161. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/line_by_line.py +0 -0
  162. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  163. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/openqa.py +0 -0
  164. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/how_to_analysis_result.py +0 -0
  165. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/http_client.py +0 -0
  166. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/openai_api.py +0 -0
  167. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/plugin_registry.py +0 -0
  168. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/query_parameters.py +0 -0
  169. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/server_sent_event.py +0 -0
  170. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/preprocess/__init__.py +0 -0
  171. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  172. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/__init__.py +0 -0
  173. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/arc.yaml +0 -0
  174. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh.yaml +0 -0
  175. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  176. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval.yaml +0 -0
  177. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  178. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  179. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  180. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  181. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  182. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  183. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  184. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run.py +0 -0
  185. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run_arena.py +0 -0
  186. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run_ms.py +0 -0
  187. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/summarizer.py +0 -0
  188. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/__init__.py +0 -0
  189. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  190. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  191. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  192. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  193. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  194. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  195. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  196. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  197. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  198. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  199. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  200. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  201. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  202. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  203. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  204. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  205. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  206. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  207. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/__init__.py +0 -0
  208. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/combine_reports.py +0 -0
  209. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  210. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/rewrite_eval_results.py +0 -0
  211. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/__init__.py +0 -0
  212. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/arena_utils.py +0 -0
  213. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/completion_parsers.py +0 -0
  214. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/logger.py +0 -0
  215. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/task_cfg_parser.py +0 -0
  216. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/task_utils.py +0 -0
  217. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/utils.py +0 -0
  218. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/dependency_links.txt +0 -0
  219. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/entry_points.txt +0 -0
  220. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/not-zip-safe +0 -0
  221. {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/top_level.txt +0 -0
  222. {evalscope-0.6.0 → evalscope-0.6.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.6.0
+ Version: 0.6.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -28,7 +28,7 @@ Requires-Dist: nltk>=3.9
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: plotly
- Requires-Dist: pyarrow
+ Requires-Dist: pyarrow<=17.0.0
  Requires-Dist: pympler
  Requires-Dist: pyyaml
  Requires-Dist: regex
@@ -48,12 +48,12 @@ Requires-Dist: transformers_stream_generator
  Requires-Dist: jieba
  Requires-Dist: rouge-chinese
  Provides-Extra: opencompass
- Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
+ Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
  Provides-Extra: rag
- Requires-Dist: mteb>=0.14.16; extra == "rag"
- Requires-Dist: ragas<0.3,>=0.2.3; extra == "rag"
+ Requires-Dist: mteb==1.19.4; extra == "rag"
+ Requires-Dist: ragas==0.2.5; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
@@ -96,7 +96,7 @@ Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: plotly; extra == "all"
- Requires-Dist: pyarrow; extra == "all"
+ Requires-Dist: pyarrow<=17.0.0; extra == "all"
  Requires-Dist: pympler; extra == "all"
  Requires-Dist: pyyaml; extra == "all"
  Requires-Dist: regex; extra == "all"
@@ -115,10 +115,10 @@ Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: transformers_stream_generator; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
- Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
+ Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
- Requires-Dist: mteb>=0.14.16; extra == "all"
- Requires-Dist: ragas<0.3,>=0.2.3; extra == "all"
+ Requires-Dist: mteb==1.19.4; extra == "all"
+ Requires-Dist: ragas==0.2.5; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"


@@ -140,6 +140,7 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
  <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
  <p>

+ > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Table of Contents
  - [Introduction](#introduction)
@@ -165,7 +166,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
- 3. **Evaluation Backend**:
+ 3. **Evaluation Backend**:
  - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
  - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
  - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -252,7 +253,7 @@ You can execute this command from any directory:
  python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
  ```

  #### Install from source
@@ -359,13 +360,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

  ## Offline Evaluation
- You can use local dataset to evaluate the model without internet connection.
+ You can use local dataset to evaluate the model without internet connection.

  Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


  ## Arena Mode
- The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+ The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

@@ -17,6 +17,7 @@
  <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
  <p>

+ > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Table of Contents
  - [Introduction](#introduction)
@@ -42,7 +43,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
- 3. **Evaluation Backend**:
+ 3. **Evaluation Backend**:
  - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
  - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
  - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -129,7 +130,7 @@ You can execute this command from any directory:
  python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
  ```

  #### Install from source
@@ -236,13 +237,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

  ## Offline Evaluation
- You can use local dataset to evaluate the model without internet connection.
+ You can use local dataset to evaluate the model without internet connection.

  Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


  ## Arena Mode
- The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+ The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

@@ -270,4 +271,4 @@ Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalsc

  ## Star History

- [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
+ [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -50,6 +50,7 @@ with read_base():
  from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
  from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
  from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+ from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets

  # Note: to be supported
  # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
@@ -17,57 +17,57 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
  max_fraction_of_documents_to_embed = None

  metadata = TaskMetadata(
- name="CLSClusteringS2S",
- description="Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.",
- reference="https://arxiv.org/abs/2209.05034",
+ name='CLSClusteringS2S',
+ description='Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.',
+ reference='https://arxiv.org/abs/2209.05034',
  dataset={
- "path": "C-MTEB/CLSClusteringS2S",
- "revision": "e458b3f5414b62b7f9f83499ac1f5497ae2e869f",
+ 'path': 'C-MTEB/CLSClusteringS2S',
+ 'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
  },
- type="Clustering",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="v_measure",
- date=("2022-01-01", "2022-09-12"),
- domains=["Academic", "Written"],
- task_subtypes=["Thematic clustering", "Topic classification"],
- license="Apache-2.0",
- annotations_creators="derived",
+ type='Clustering',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='v_measure',
+ date=('2022-01-01', '2022-09-12'),
+ domains=['Academic', 'Written'],
+ task_subtypes=['Thematic clustering', 'Topic classification'],
+ license='apache-2.0',
+ annotations_creators='derived',
  dialect=[],
- sample_creation="found",
+ sample_creation='found',
  bibtex_citation="""@misc{li2022csl,
- title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+ title={CSL: A Large-scale Chinese Scientific Literature Dataset},
  author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
  year={2022},
  eprint={2209.05034},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
- }""",
+ }""", # noqa
  descriptive_stats={
- "n_samples": {"test": NUM_SAMPLES},
- "avg_character_length": {},
+ 'n_samples': {'test': NUM_SAMPLES},
+ 'avg_character_length': {},
  },
  )

  def dataset_transform(self):
  ds = {}
  for split in self.metadata.eval_splits:
- labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+ labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
  sentences = list(
- itertools.chain.from_iterable(self.dataset[split]["sentences"])
+ itertools.chain.from_iterable(self.dataset[split]['sentences'])
  )

  check_label_distribution(self.dataset[split])

- ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+ ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
  self.dataset = DatasetDict(ds)
  self.dataset = self.stratified_subsampling(
  self.dataset,
  self.seed,
  self.metadata.eval_splits,
- label="labels",
+ label='labels',
  n_samples=NUM_SAMPLES,
  )

@@ -77,57 +77,57 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
  max_fraction_of_documents_to_embed = None

  metadata = TaskMetadata(
- name="CLSClusteringP2P",
- description="Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.",
- reference="https://arxiv.org/abs/2209.05034",
+ name='CLSClusteringP2P',
+ description='Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.',
+ reference='https://arxiv.org/abs/2209.05034',
  dataset={
- "path": "C-MTEB/CLSClusteringP2P",
- "revision": "4b6227591c6c1a73bc76b1055f3b7f3588e72476",
+ 'path': 'C-MTEB/CLSClusteringP2P',
+ 'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
  },
- type="Clustering",
- category="p2p",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="v_measure",
- date=("2022-01-01", "2022-09-12"),
- domains=["Academic", "Written"],
- task_subtypes=["Thematic clustering", "Topic classification"],
- license="Apache-2.0",
- annotations_creators="derived",
+ type='Clustering',
+ category='p2p',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='v_measure',
+ date=('2022-01-01', '2022-09-12'),
+ domains=['Academic', 'Written'],
+ task_subtypes=['Thematic clustering', 'Topic classification'],
+ license='apache-2.0',
+ annotations_creators='derived',
  dialect=[],
- sample_creation="found",
+ sample_creation='found',
  bibtex_citation="""@misc{li2022csl,
- title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+ title={CSL: A Large-scale Chinese Scientific Literature Dataset},
  author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
  year={2022},
  eprint={2209.05034},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
- }""",
+ }""", # noqa
  descriptive_stats={
- "n_samples": {"test": NUM_SAMPLES},
- "avg_character_length": {},
+ 'n_samples': {'test': NUM_SAMPLES},
+ 'avg_character_length': {},
  },
  )

  def dataset_transform(self):
  ds = {}
  for split in self.metadata.eval_splits:
- labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+ labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
  sentences = list(
- itertools.chain.from_iterable(self.dataset[split]["sentences"])
+ itertools.chain.from_iterable(self.dataset[split]['sentences'])
  )

  check_label_distribution(self.dataset[split])

- ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+ ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
  self.dataset = DatasetDict(ds)
  self.dataset = self.stratified_subsampling(
  self.dataset,
  self.seed,
  self.metadata.eval_splits,
- label="labels",
+ label='labels',
  n_samples=NUM_SAMPLES,
  )

@@ -137,26 +137,26 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  max_fraction_of_documents_to_embed = None

  metadata = TaskMetadata(
- name="ThuNewsClusteringS2S",
+ name='ThuNewsClusteringS2S',
  dataset={
- "path": "C-MTEB/ThuNewsClusteringS2S",
- "revision": "8a8b2caeda43f39e13c4bc5bea0f8a667896e10d",
+ 'path': 'C-MTEB/ThuNewsClusteringS2S',
+ 'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
  },
- description="Clustering of titles from the THUCNews dataset",
- reference="http://thuctc.thunlp.org/",
- type="Clustering",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="v_measure",
- date=("2006-01-01", "2007-01-01"),
- domains=["News", "Written"],
- task_subtypes=["Thematic clustering", "Topic classification"],
- license="Not specified",
- annotations_creators="derived",
+ description='Clustering of titles from the THUCNews dataset',
+ reference='http://thuctc.thunlp.org/',
+ type='Clustering',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='v_measure',
+ date=('2006-01-01', '2007-01-01'),
+ domains=['News', 'Written'],
+ task_subtypes=['Thematic clustering', 'Topic classification'],
+ license='apache-2.0',
+ annotations_creators='derived',
  dialect=[],
- sample_creation="found",
+ sample_creation='found',
  bibtex_citation="""@software{THUCTC,
  author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
  title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -166,28 +166,28 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- "n_samples": {"test": NUM_SAMPLES},
- "avg_character_length": {},
+ 'n_samples': {'test': NUM_SAMPLES},
+ 'avg_character_length': {},
  },
  )

  def dataset_transform(self):
  ds = {}
  for split in self.metadata.eval_splits:
- labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+ labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
  sentences = list(
- itertools.chain.from_iterable(self.dataset[split]["sentences"])
+ itertools.chain.from_iterable(self.dataset[split]['sentences'])
  )

  check_label_distribution(self.dataset[split])

- ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+ ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
  self.dataset = DatasetDict(ds)
  self.dataset = self.stratified_subsampling(
  self.dataset,
  self.seed,
  self.metadata.eval_splits,
- label="labels",
+ label='labels',
  n_samples=NUM_SAMPLES,
  )

@@ -197,26 +197,26 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  max_fraction_of_documents_to_embed = None

  metadata = TaskMetadata(
- name="ThuNewsClusteringP2P",
+ name='ThuNewsClusteringP2P',
  dataset={
- "path": "C-MTEB/ThuNewsClusteringP2P",
- "revision": "5798586b105c0434e4f0fe5e767abe619442cf93",
+ 'path': 'C-MTEB/ThuNewsClusteringP2P',
+ 'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
  },
- description="Clustering of titles + abstracts from the THUCNews dataset",
- reference="http://thuctc.thunlp.org/",
- type="Clustering",
- category="p2p",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="v_measure",
- date=("2006-01-01", "2007-01-01"),
- domains=["News", "Written"],
- task_subtypes=["Thematic clustering", "Topic classification"],
- license="Not specified",
- annotations_creators="derived",
+ description='Clustering of titles + abstracts from the THUCNews dataset',
+ reference='http://thuctc.thunlp.org/',
+ type='Clustering',
+ category='p2p',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='v_measure',
+ date=('2006-01-01', '2007-01-01'),
+ domains=['News', 'Written'],
+ task_subtypes=['Thematic clustering', 'Topic classification'],
+ license='apache-2.0',
+ annotations_creators='derived',
  dialect=[],
- sample_creation="found",
+ sample_creation='found',
  bibtex_citation="""@software{THUCTC,
  author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
  title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -226,27 +226,27 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- "n_samples": {"test": NUM_SAMPLES},
- "avg_character_length": {},
+ 'n_samples': {'test': NUM_SAMPLES},
+ 'avg_character_length': {},
  },
  )

  def dataset_transform(self):
  ds = {}
  for split in self.metadata.eval_splits:
- labels = list(itertools.chain.from_iterable(self.dataset[split]["labels"]))
+ labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
  sentences = list(
- itertools.chain.from_iterable(self.dataset[split]["sentences"])
+ itertools.chain.from_iterable(self.dataset[split]['sentences'])
  )

  check_label_distribution(self.dataset[split])

- ds[split] = Dataset.from_dict({"labels": labels, "sentences": sentences})
+ ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
  self.dataset = DatasetDict(ds)
  self.dataset = self.stratified_subsampling(
  self.dataset,
  self.seed,
  self.metadata.eval_splits,
- label="labels",
+ label='labels',
  n_samples=NUM_SAMPLES,
  )