opencompass 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. {opencompass-0.2.3 → opencompass-0.2.5}/PKG-INFO +68 -21
  2. {opencompass-0.2.3 → opencompass-0.2.5}/README.md +67 -20
  3. opencompass-0.2.5/opencompass/__init__.py +1 -0
  4. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/GaokaoBench.py +23 -6
  5. opencompass-0.2.5/opencompass/datasets/MMLUArabic.py +33 -0
  6. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +5 -1
  7. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +5 -1
  8. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_SPP.py +5 -1
  9. opencompass-0.2.5/opencompass/datasets/QuALITY.py +59 -0
  10. opencompass-0.2.5/opencompass/datasets/TheoremQA/__init__.py +4 -0
  11. opencompass-0.2.3/opencompass/datasets/TheoremQA.py → opencompass-0.2.5/opencompass/datasets/TheoremQA/legacy.py +1 -1
  12. opencompass-0.2.5/opencompass/datasets/TheoremQA/main.py +66 -0
  13. opencompass-0.2.5/opencompass/datasets/TheoremQA/number_utils.py +98 -0
  14. opencompass-0.2.5/opencompass/datasets/TheoremQA/utils.py +110 -0
  15. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/__init__.py +10 -0
  16. opencompass-0.2.5/opencompass/datasets/apps.py +877 -0
  17. opencompass-0.2.5/opencompass/datasets/charm.py +55 -0
  18. opencompass-0.2.5/opencompass/datasets/chembench.py +34 -0
  19. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cibench.py +178 -149
  20. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/custom.py +10 -0
  21. opencompass-0.2.5/opencompass/datasets/drop_simple_eval.py +80 -0
  22. opencompass-0.2.5/opencompass/datasets/flames.py +57 -0
  23. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gpqa.py +53 -1
  24. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humanevalx.py +4 -1
  25. opencompass-0.2.5/opencompass/datasets/llm_compression.py +36 -0
  26. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math.py +34 -6
  27. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mathbench.py +1 -1
  28. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mbpp.py +53 -44
  29. opencompass-0.2.5/opencompass/datasets/mgsm.py +78 -0
  30. opencompass-0.2.5/opencompass/datasets/s3eval.py +169 -0
  31. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/__init__.py +2 -0
  32. opencompass-0.2.5/opencompass/datasets/subjective/arena_hard.py +35 -0
  33. opencompass-0.2.5/opencompass/datasets/subjective/compassbench.py +101 -0
  34. opencompass-0.2.5/opencompass/datasets/taco.py +824 -0
  35. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/winogrande.py +9 -0
  36. opencompass-0.2.5/opencompass/models/__init__.py +47 -0
  37. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/ai360_api.py +27 -25
  38. opencompass-0.2.5/opencompass/models/baichuan_api.py +283 -0
  39. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/baidu_api.py +30 -13
  40. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/base.py +2 -2
  41. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/base_api.py +4 -4
  42. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/bytedance_api.py +4 -4
  43. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/claude_api.py +4 -4
  44. opencompass-0.2.3/opencompass/models/minimax_api.py → opencompass-0.2.5/opencompass/models/deepseek_api.py +66 -70
  45. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/gemini_api.py +4 -67
  46. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/huggingface.py +10 -3
  47. opencompass-0.2.5/opencompass/models/huggingface_above_v4_33.py +440 -0
  48. opencompass-0.2.5/opencompass/models/hunyuan_api.py +121 -0
  49. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lagent.py +4 -3
  50. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lightllm_api.py +169 -4
  51. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/llama2.py +1 -1
  52. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lmdeploy_pytorch.py +14 -5
  53. opencompass-0.2.5/opencompass/models/lmdeploy_tis.py +200 -0
  54. opencompass-0.2.5/opencompass/models/minimax_api.py +352 -0
  55. opencompass-0.2.5/opencompass/models/mistral_api.py +123 -0
  56. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/moonshot_api.py +24 -26
  57. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/nanbeige_api.py +4 -4
  58. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/openai_api.py +44 -147
  59. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/pangu_api.py +4 -4
  60. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/qwen_api.py +27 -14
  61. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/sensetime_api.py +14 -9
  62. opencompass-0.2.5/opencompass/models/stepfun_api.py +182 -0
  63. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind.py +59 -14
  64. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind_api.py +4 -4
  65. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind_tis.py +10 -4
  66. opencompass-0.2.5/opencompass/models/turbomind_with_tf_above_v4_33.py +195 -0
  67. opencompass-0.2.5/opencompass/models/unigpt_api.py +147 -0
  68. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/vllm.py +38 -20
  69. opencompass-0.2.5/opencompass/models/vllm_with_tf_above_v4_33.py +127 -0
  70. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/xunfei_api.py +153 -4
  71. opencompass-0.2.5/opencompass/models/yayi_api.py +261 -0
  72. opencompass-0.2.3/opencompass/models/baichuan_api.py → opencompass-0.2.5/opencompass/models/yi_api.py +67 -48
  73. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/zhipuai_api.py +4 -4
  74. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/zhipuai_v2_api.py +12 -6
  75. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/__init__.py +1 -0
  76. opencompass-0.2.5/opencompass/openicl/icl_evaluator/icl_bpc_evaluator.py +32 -0
  77. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +26 -0
  78. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/lm_evaluator.py +65 -28
  79. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/__init__.py +1 -0
  80. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +2 -0
  81. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +2 -14
  82. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +27 -49
  83. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +35 -67
  84. opencompass-0.2.5/opencompass/openicl/icl_inferencer/icl_sw_ce_loss_inferencer.py +352 -0
  85. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_prompt_template.py +4 -4
  86. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/__init__.py +0 -1
  87. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/base.py +18 -7
  88. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/num_worker.py +5 -3
  89. opencompass-0.2.5/opencompass/partitioners/sub_naive.py +220 -0
  90. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/sub_size.py +29 -6
  91. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/registry.py +15 -9
  92. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/base.py +2 -1
  93. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/dlc.py +57 -22
  94. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/local.py +18 -2
  95. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/__init__.py +2 -0
  96. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/default.py +10 -8
  97. opencompass-0.2.5/opencompass/summarizers/llm_compression.py +200 -0
  98. opencompass-0.2.5/opencompass/summarizers/multi_faceted.py +46 -0
  99. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/needlebench.py +234 -173
  100. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/__init__.py +4 -0
  101. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/alignmentbench.py +43 -33
  102. opencompass-0.2.3/opencompass/summarizers/subjective/mtbench.py → opencompass-0.2.5/opencompass/summarizers/subjective/all_obj.py +31 -54
  103. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/alpacaeval.py +2 -1
  104. opencompass-0.2.5/opencompass/summarizers/subjective/arenahard.py +309 -0
  105. opencompass-0.2.5/opencompass/summarizers/subjective/compass_arena.py +240 -0
  106. opencompass-0.2.5/opencompass/summarizers/subjective/compassbench.py +241 -0
  107. opencompass-0.2.5/opencompass/summarizers/subjective/flames.py +93 -0
  108. opencompass-0.2.5/opencompass/summarizers/subjective/mtbench.py +153 -0
  109. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/multiround.py +2 -1
  110. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/__init__.py +0 -1
  111. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_eval.py +9 -4
  112. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_infer.py +16 -7
  113. opencompass-0.2.5/opencompass/tasks/subjective_eval.py +443 -0
  114. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/abbr.py +22 -0
  115. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/file.py +3 -3
  116. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/prompt.py +14 -7
  117. opencompass-0.2.5/opencompass/utils/run.py +350 -0
  118. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/text_postprocessors.py +21 -15
  119. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/PKG-INFO +68 -21
  120. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/SOURCES.txt +38 -3
  121. opencompass-0.2.5/opencompass.egg-info/entry_points.txt +3 -0
  122. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/requires.txt +4 -1
  123. {opencompass-0.2.3 → opencompass-0.2.5}/setup.py +40 -33
  124. opencompass-0.2.3/opencompass/__init__.py +0 -1
  125. opencompass-0.2.3/opencompass/models/__init__.py +0 -34
  126. opencompass-0.2.3/opencompass/partitioners/mm_naive.py +0 -119
  127. opencompass-0.2.3/opencompass/partitioners/sub_naive.py +0 -110
  128. opencompass-0.2.3/opencompass/summarizers/subjective/compass_arena.py +0 -204
  129. opencompass-0.2.3/opencompass/tasks/mm_infer.py +0 -160
  130. opencompass-0.2.3/opencompass/tasks/subjective_eval.py +0 -282
  131. opencompass-0.2.3/opencompass/utils/run.py +0 -212
  132. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/FinanceIQ.py +0 -0
  133. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/__init__.py +0 -0
  134. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
  135. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
  136. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
  137. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
  138. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
  139. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
  140. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/prompts.py +0 -0
  141. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/utils.py +0 -0
  142. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/OpenFinData.py +0 -0
  143. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/advglue.py +0 -0
  144. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/afqmcd.py +0 -0
  145. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/__init__.py +0 -0
  146. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/agieval.py +0 -0
  147. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/constructions.py +0 -0
  148. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/dataset_loader.py +0 -0
  149. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/evaluation.py +0 -0
  150. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/math_equivalence.py +0 -0
  151. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/post_process.py +0 -0
  152. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/utils.py +0 -0
  153. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/anli.py +0 -0
  154. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/anthropics_evals.py +0 -0
  155. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/arc.py +0 -0
  156. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ax.py +0 -0
  157. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/base.py +0 -0
  158. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/bbh.py +0 -0
  159. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/boolq.py +0 -0
  160. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/bustum.py +0 -0
  161. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/c3.py +0 -0
  162. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cb.py +0 -0
  163. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ceval.py +0 -0
  164. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/chid.py +0 -0
  165. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/circular.py +0 -0
  166. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/civilcomments.py +0 -0
  167. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/clozeTest_maxmin.py +0 -0
  168. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cluewsc.py +0 -0
  169. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmb.py +1 -1
  170. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmmlu.py +0 -0
  171. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmnli.py +0 -0
  172. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmrc.py +0 -0
  173. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa.py +0 -0
  174. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa_cn.py +0 -0
  175. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/copa.py +0 -0
  176. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/crowspairs.py +0 -0
  177. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/crowspairs_cn.py +0 -0
  178. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/csl.py +0 -0
  179. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cvalues.py +0 -0
  180. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/drcd.py +0 -0
  181. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/drop.py +0 -0
  182. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ds1000.py +0 -0
  183. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ds1000_interpreter.py +0 -0
  184. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/eprstmt.py +0 -0
  185. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/flores.py +0 -0
  186. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/game24.py +0 -0
  187. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/govrepcrs.py +0 -0
  188. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gsm8k.py +0 -0
  189. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gsm_hard.py +0 -0
  190. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/hellaswag.py +0 -0
  191. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/huggingface.py +0 -0
  192. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humaneval.py +0 -0
  193. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humaneval_multi.py +0 -0
  194. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/hungarian_math.py +0 -0
  195. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/__init__.py +0 -0
  196. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
  197. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
  198. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
  199. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
  200. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
  201. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
  202. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
  203. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
  204. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
  205. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
  206. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
  207. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
  208. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/utils.py +0 -0
  209. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/iwslt2017.py +0 -0
  210. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/jigsawmultilingual.py +0 -0
  211. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/jsonl.py +0 -0
  212. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/kaoshi.py +0 -0
  213. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lambada.py +0 -0
  214. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lawbench/__init__.py +0 -0
  215. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lawbench/lawbench.py +0 -0
  216. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lcsts.py +0 -0
  217. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/__init__.py +0 -0
  218. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/evaluators.py +0 -0
  219. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_coursera.py +0 -0
  220. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
  221. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
  222. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gsm100.py +0 -0
  223. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
  224. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
  225. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
  226. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
  227. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_natural_question.py +0 -0
  228. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_news_summ.py +0 -0
  229. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
  230. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
  231. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_quality.py +0 -0
  232. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_review_summ.py +0 -0
  233. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
  234. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
  235. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tpo.py +0 -0
  236. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
  237. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lmeval.py +0 -0
  238. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/__init__.py +0 -0
  239. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/evaluators.py +0 -0
  240. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
  241. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
  242. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
  243. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
  244. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
  245. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
  246. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
  247. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
  248. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
  249. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_musique.py +0 -0
  250. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
  251. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
  252. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
  253. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
  254. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
  255. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
  256. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
  257. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
  258. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trec.py +0 -0
  259. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
  260. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
  261. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/__init__.py +0 -0
  262. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/evaluators.py +0 -0
  263. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
  264. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
  265. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
  266. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
  267. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
  268. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
  269. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
  270. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
  271. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
  272. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
  273. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
  274. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mastermath2024v1.py +0 -0
  275. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math401.py +0 -0
  276. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math_intern.py +0 -0
  277. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/__init__.py +0 -0
  278. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/constructions.py +0 -0
  279. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/dataset_loader.py +0 -0
  280. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/evaluation.py +0 -0
  281. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/math_equivalence.py +0 -0
  282. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/medbench.py +0 -0
  283. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/post_process.py +0 -0
  284. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/utils.py +0 -0
  285. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mmlu.py +0 -0
  286. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/multirc.py +0 -0
  287. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/narrativeqa.py +0 -0
  288. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/natural_question.py +0 -0
  289. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/natural_question_cn.py +0 -0
  290. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/obqa.py +0 -0
  291. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/piqa.py +0 -0
  292. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/py150.py +0 -0
  293. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/qasper.py +0 -0
  294. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/qaspercut.py +0 -0
  295. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/race.py +0 -0
  296. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/realtoxicprompts.py +0 -0
  297. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
  298. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/reasonbench/__init__.py +0 -0
  299. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/record.py +0 -0
  300. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/rolebench.py +0 -0
  301. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/safety.py +0 -0
  302. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/scibench.py +0 -0
  303. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/siqa.py +0 -0
  304. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/squad20.py +0 -0
  305. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/storycloze.py +0 -0
  306. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/strategyqa.py +0 -0
  307. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/alignbench.py +0 -0
  308. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/compass_arena.py +0 -0
  309. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/corev2.py +0 -0
  310. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/creationbench.py +0 -0
  311. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/information_retrival.py +0 -0
  312. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/mtbench.py +0 -0
  313. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/multiround.py +0 -0
  314. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/subjective_cmp.py +0 -0
  315. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/summedits.py +0 -0
  316. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/summscreen.py +0 -0
  317. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/svamp.py +0 -0
  318. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tabmwp.py +0 -0
  319. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/__init__.py +0 -0
  320. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
  321. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
  322. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
  323. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
  324. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
  325. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/schema.py +0 -0
  326. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/__init__.py +0 -0
  327. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/convert_results.py +0 -0
  328. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/format_load.py +0 -0
  329. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/meta_template.py +0 -0
  330. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/template.py +0 -0
  331. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tnews.py +0 -0
  332. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/triviaqa.py +0 -0
  333. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/triviaqarc.py +0 -0
  334. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/truthfulqa.py +0 -0
  335. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tydiqa.py +0 -0
  336. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wic.py +0 -0
  337. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wikibench.py +0 -0
  338. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/winograd.py +0 -0
  339. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wnli.py +0 -0
  340. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wsc.py +0 -0
  341. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xcopa.py +0 -0
  342. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xiezhi.py +0 -0
  343. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xlsum.py +0 -0
  344. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xsum.py +0 -0
  345. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/__init__.py +0 -0
  346. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/dump_results.py +0 -0
  347. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/mme_score.py +0 -0
  348. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/seedbench.py +0 -0
  349. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/accessory.py +0 -0
  350. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/alaya.py +0 -0
  351. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/__init__.py +0 -0
  352. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/postprocessors.py +0 -0
  353. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/glm.py +0 -0
  354. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/intern_model.py +0 -0
  355. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/krgpt_api.py +0 -0
  356. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/langchain.py +0 -0
  357. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/mixtral.py +0 -0
  358. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/modelscope.py +0 -0
  359. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/__init__.py +0 -0
  360. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_dataset_reader.py +0 -0
  361. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
  362. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
  363. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
  364. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
  365. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
  366. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
  367. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
  368. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
  369. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
  370. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
  371. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
  372. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
  373. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +0 -0
  374. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
  375. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
  376. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
  377. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
  378. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/__init__.py +0 -0
  379. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
  380. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
  381. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
  382. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
  383. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
  384. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
  385. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
  386. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
  387. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
  388. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/utils/__init__.py +0 -0
  389. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/utils/logging.py +0 -0
  390. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/naive.py +0 -0
  391. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/size.py +0 -0
  392. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/__init__.py +0 -0
  393. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/local_api.py +0 -0
  394. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/slurm.py +0 -0
  395. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/slurm_sequential.py +0 -0
  396. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/circular.py +0 -0
  397. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/multi_model.py +0 -0
  398. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/corev2.py +0 -0
  399. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/creationbench.py +0 -0
  400. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/information_retrival.py +0 -0
  401. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
  402. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/utils.py +0 -0
  403. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/summarizer_pretrain.py +0 -0
  404. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/base.py +0 -0
  405. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/llm_eval.py +0 -0
  406. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_attack.py +0 -0
  407. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/__init__.py +0 -0
  408. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/auxiliary.py +0 -0
  409. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/build.py +0 -0
  410. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/collect_env.py +0 -0
  411. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/dependency.py +0 -0
  412. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/fileio.py +0 -0
  413. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/lark.py +0 -0
  414. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/logging.py +0 -0
  415. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/menu.py +0 -0
  416. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/types.py +0 -0
  417. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/dependency_links.txt +0 -0
  418. {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/top_level.txt +0 -0
  419. {opencompass-0.2.3 → opencompass-0.2.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: opencompass
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: A comprehensive toolkit for large model evaluation
5
5
  Home-page: https://github.com/open-compass/opencompass
6
6
  Author: OpenCompass Contributors
@@ -11,8 +11,13 @@ Description: <div align="center">
11
11
  <br />
12
12
  <br />
13
13
 
14
- [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
15
- [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
14
+ [![][github-release-shield]][github-release-link]
15
+ [![][github-releasedate-shield]][github-releasedate-link]
16
+ [![][github-contributors-shield]][github-contributors-link]<br>
17
+ [![][github-forks-shield]][github-forks-link]
18
+ [![][github-stars-shield]][github-stars-link]
19
+ [![][github-issues-shield]][github-issues-link]
20
+ [![][github-license-shield]][github-license-link]
16
21
 
17
22
  <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
18
23
 
@@ -25,12 +30,18 @@ Description: <div align="center">
25
30
 
26
31
  English | [简体中文](README_zh-CN.md)
27
32
 
33
+ [![][github-trending-shield]][github-trending-url]
34
+
28
35
  </div>
29
36
 
30
37
  <p align="center">
31
38
  👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
32
39
  </p>
33
40
 
41
+ > \[!IMPORTANT\]
42
+ >
43
+ > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
44
+
34
45
  ## 📣 OpenCompass 2.0
35
46
 
36
47
  We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
@@ -42,6 +53,14 @@ Description: <div align="center">
42
53
 
43
54
  **CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits in your research and products.
44
55
 
56
+ <details>
57
+ <summary><kbd>Star History</kbd></summary>
58
+ <picture>
59
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
60
+ <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
61
+ </picture>
62
+ </details>
63
+
45
64
  ## 🧭 Welcome
46
65
 
47
66
  to **OpenCompass**!
@@ -59,12 +78,14 @@ Description: <div align="center">
59
78
 
60
79
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
61
80
 
62
- - **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html) 🔥🔥🔥.
63
- - **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information ! 🔥🔥🔥.
64
- - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! 🔥🔥🔥.
65
- - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
66
- - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development.
67
- - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details!
81
+ - **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
82
+ - **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
83
+ - **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
84
+ - **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use! 🔥🔥🔥.
85
+ - **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥.
86
+ - **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
87
+ - **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
88
+ - **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
68
89
 
69
90
  > [More](docs/en/notes/news.md)
70
91
 
@@ -114,7 +135,7 @@ Description: <div align="center">
114
135
  git clone https://github.com/open-compass/opencompass opencompass
115
136
  cd opencompass
116
137
  pip install -e .
117
- # also please install requiresments packages via `pip install -r requirements/api.txt` for API models if needed.
138
+ # also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
118
139
  ```
119
140
 
120
141
  ### 📂 Data Preparation
@@ -149,19 +170,13 @@ Description: <div align="center">
149
170
  You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
150
171
 
151
172
  ```bash
152
- python run.py --datasets ceval_ppl mmlu_ppl \
153
- --hf-path huggyllama/llama-7b \ # HuggingFace model path
154
- --model-kwargs device_map='auto' \ # Arguments for model construction
155
- --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
156
- --max-out-len 100 \ # Maximum number of tokens generated
157
- --max-seq-len 2048 \ # Maximum sequence length the model can accept
158
- --batch-size 8 \ # Batch size
159
- --no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
160
- --num-gpus 1 # Number of minimum required GPUs
173
+ python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
161
174
  ```
162
175
 
163
- > **Note**<br />
164
- > To run the command above, you will need to remove the comments starting from `# ` first.
176
+ > \[!TIP\]
177
+ >
178
+ > Configurations with `_ppl` are typically designed for base models.
179
+ > Configurations with `_gen` can be used for both base models and chat models.
165
180
 
166
181
  Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
167
182
 
@@ -447,6 +462,7 @@ Description: <div align="center">
447
462
 
448
463
  - [InternLM](https://github.com/InternLM/InternLM)
449
464
  - [LLaMA](https://github.com/facebookresearch/llama)
465
+ - [LLaMA3](https://github.com/meta-llama/llama3)
450
466
  - [Vicuna](https://github.com/lm-sys/FastChat)
451
467
  - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
452
468
  - [Baichuan](https://github.com/baichuan-inc)
@@ -505,6 +521,20 @@ Description: <div align="center">
505
521
 
506
522
  We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
507
523
 
524
+ <!-- Copy-paste in your Readme.md file -->
525
+
526
+ <!-- Made with [OSS Insight](https://ossinsight.io/) -->
527
+
528
+ <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
529
+ <table>
530
+ <tr>
531
+ <th colspan="2">
532
+ <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
533
+ </th>
534
+ </tr>
535
+ </table>
536
+ </a>
537
+
508
538
  ## 🤝 Acknowledgements
509
539
 
510
540
  Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
@@ -524,6 +554,23 @@ Description: <div align="center">
524
554
 
525
555
  <p align="right"><a href="#top">🔝Back to top</a></p>
526
556
 
557
+ [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
558
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
559
+ [github-forks-link]: https://github.com/open-compass/opencompass/network/members
560
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
561
+ [github-issues-link]: https://github.com/open-compass/opencompass/issues
562
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
563
+ [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
564
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
565
+ [github-release-link]: https://github.com/open-compass/opencompass/releases
566
+ [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
567
+ [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
568
+ [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
569
+ [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
570
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
571
+ [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
572
+ [github-trending-url]: https://trendshift.io/repositories/6630
573
+
527
574
  Keywords: AI,NLP,in-context learning,large language model,evaluation,benchmark,llm
528
575
  Platform: UNKNOWN
529
576
  Classifier: Programming Language :: Python :: 3.8
@@ -3,8 +3,13 @@
3
3
  <br />
4
4
  <br />
5
5
 
6
- [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
7
- [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
6
+ [![][github-release-shield]][github-release-link]
7
+ [![][github-releasedate-shield]][github-releasedate-link]
8
+ [![][github-contributors-shield]][github-contributors-link]<br>
9
+ [![][github-forks-shield]][github-forks-link]
10
+ [![][github-stars-shield]][github-stars-link]
11
+ [![][github-issues-shield]][github-issues-link]
12
+ [![][github-license-shield]][github-license-link]
8
13
 
9
14
  <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
10
15
 
@@ -17,12 +22,18 @@
17
22
 
18
23
  English | [简体中文](README_zh-CN.md)
19
24
 
25
+ [![][github-trending-shield]][github-trending-url]
26
+
20
27
  </div>
21
28
 
22
29
  <p align="center">
23
30
  👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
24
31
  </p>
25
32
 
33
+ > \[!IMPORTANT\]
34
+ >
35
+ > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
36
+
26
37
  ## 📣 OpenCompass 2.0
27
38
 
28
39
  We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
@@ -34,6 +45,14 @@ We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three
34
45
 
35
46
  **CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits in your research and products.
36
47
 
48
+ <details>
49
+ <summary><kbd>Star History</kbd></summary>
50
+ <picture>
51
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
52
+ <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
53
+ </picture>
54
+ </details>
55
+
37
56
  ## 🧭 Welcome
38
57
 
39
58
  to **OpenCompass**!
@@ -51,12 +70,14 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
51
70
 
52
71
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
53
72
 
54
- - **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html) 🔥🔥🔥.
55
- - **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information ! 🔥🔥🔥.
56
- - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! 🔥🔥🔥.
57
- - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
58
- - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development.
59
- - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details!
73
+ - **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
74
+ - **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
75
+ - **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
76
+ - **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use! 🔥🔥🔥.
77
+ - **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥.
78
+ - **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
79
+ - **\[2024.02.29\]** We supported the MT-Bench, AlpacalEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
80
+ - **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
60
81
 
61
82
  > [More](docs/en/notes/news.md)
62
83
 
@@ -106,7 +127,7 @@ conda activate opencompass
106
127
  git clone https://github.com/open-compass/opencompass opencompass
107
128
  cd opencompass
108
129
  pip install -e .
109
- # also please install requiresments packages via `pip install -r requirements/api.txt` for API models if needed.
130
+ # also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
110
131
  ```
111
132
 
112
133
  ### 📂 Data Preparation
@@ -141,19 +162,13 @@ python tools/list_configs.py llama mmlu
141
162
  You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
142
163
 
143
164
  ```bash
144
- python run.py --datasets ceval_ppl mmlu_ppl \
145
- --hf-path huggyllama/llama-7b \ # HuggingFace model path
146
- --model-kwargs device_map='auto' \ # Arguments for model construction
147
- --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
148
- --max-out-len 100 \ # Maximum number of tokens generated
149
- --max-seq-len 2048 \ # Maximum sequence length the model can accept
150
- --batch-size 8 \ # Batch size
151
- --no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
152
- --num-gpus 1 # Number of minimum required GPUs
165
+ python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
153
166
  ```
154
167
 
155
- > **Note**<br />
156
- > To run the command above, you will need to remove the comments starting from `# ` first.
168
+ > \[!TIP\]
169
+ >
170
+ > Configurations with `_ppl` are typically designed for base models.
171
+ > Configurations with `_gen` can be used for both base models and chat models.
157
172
 
158
173
  Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
159
174
 
@@ -439,6 +454,7 @@ Through the command line or configuration files, OpenCompass also supports evalu
439
454
 
440
455
  - [InternLM](https://github.com/InternLM/InternLM)
441
456
  - [LLaMA](https://github.com/facebookresearch/llama)
457
+ - [LLaMA3](https://github.com/meta-llama/llama3)
442
458
  - [Vicuna](https://github.com/lm-sys/FastChat)
443
459
  - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
444
460
  - [Baichuan](https://github.com/baichuan-inc)
@@ -497,6 +513,20 @@ Through the command line or configuration files, OpenCompass also supports evalu
497
513
 
498
514
  We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
499
515
 
516
+ <!-- Copy-paste in your Readme.md file -->
517
+
518
+ <!-- Made with [OSS Insight](https://ossinsight.io/) -->
519
+
520
+ <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
521
+ <table>
522
+ <tr>
523
+ <th colspan="2">
524
+ <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
525
+ </th>
526
+ </tr>
527
+ </table>
528
+ </a>
529
+
500
530
  ## 🤝 Acknowledgements
501
531
 
502
532
  Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
@@ -515,3 +545,20 @@ Some datasets and prompt implementations are modified from [chain-of-thought-hub
515
545
  ```
516
546
 
517
547
  <p align="right"><a href="#top">🔝Back to top</a></p>
548
+
549
+ [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
550
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
551
+ [github-forks-link]: https://github.com/open-compass/opencompass/network/members
552
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
553
+ [github-issues-link]: https://github.com/open-compass/opencompass/issues
554
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
555
+ [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
556
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
557
+ [github-release-link]: https://github.com/open-compass/opencompass/releases
558
+ [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
559
+ [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
560
+ [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
561
+ [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
562
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
563
+ [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
564
+ [github-trending-url]: https://trendshift.io/repositories/6630
@@ -0,0 +1 @@
1
+ __version__ = '0.2.5'
@@ -91,34 +91,51 @@ class GaokaoBenchEvaluator(BaseEvaluator):
91
91
  ]:
92
92
  return {'score': 0}
93
93
  elif self.question_type == 'multi_choice':
94
+ details = {}
94
95
  correct_score, total_score = 0, 0
95
- for pred, refr in zip(predictions, references):
96
+ for index, (pred, refr) in enumerate(zip(predictions, references)):
96
97
  pred = self.do_predictions_postprocess(pred)
97
98
  pred = self.ensure_same_length(pred, refr)
99
+ is_corrects = []
98
100
  for p, r in zip(pred, refr):
99
101
  if p == r:
100
102
  correct_score += 2
103
+ is_corrects.append(True)
101
104
  else:
102
105
  for i in p:
103
106
  if i not in r:
104
107
  break
105
108
  else:
106
109
  correct_score += 1
110
+ is_corrects.append(False)
107
111
  total_score += 2
108
- return {'score': correct_score / total_score * 100}
112
+ details[str(index)] = {
113
+ 'pred': pred,
114
+ 'refr': refr,
115
+ 'is_correct': all(is_corrects),
116
+ }
117
+
109
118
  else:
119
+ details = {}
110
120
  correct_score, total_score = 0, 0
111
- for pred, refr in zip(predictions, references):
121
+ for index, (pred, refr) in enumerate(zip(predictions, references)):
112
122
  if self.question_type == 'multi_question_choice':
113
123
  pred = self.do_predictions_postprocess(pred, len(refr))
114
124
  else:
115
125
  pred = self.do_predictions_postprocess(pred)
116
126
  pred = self.ensure_same_length(pred, refr)
127
+ is_corrects = []
117
128
  for p, r in zip(pred, refr):
118
- if p == r:
119
- correct_score += 1
129
+ is_correct = p == r
130
+ correct_score += is_correct
120
131
  total_score += 1
121
- return {'score': correct_score / total_score * 100}
132
+ is_corrects.append(is_correct)
133
+ details[str(index)] = {
134
+ 'pred': pred,
135
+ 'refr': refr,
136
+ 'is_correct': all(is_corrects),
137
+ }
138
+ return {'score': correct_score / total_score * 100, 'details': details}
122
139
 
123
140
 
124
141
  for question_type in valid_gaokao_bench_question_types:
@@ -0,0 +1,33 @@
1
+ import csv
2
+ import os.path as osp
3
+
4
+ from datasets import Dataset, DatasetDict
5
+
6
+ from opencompass.registry import LOAD_DATASET
7
+
8
+ from .base import BaseDataset
9
+
10
+
11
@LOAD_DATASET.register_module()
class MMLUArabicDataset(BaseDataset):
    """Loader for the per-subject CSV files of the Arabic MMLU benchmark."""

    @staticmethod
    def load(path: str, name: str):
        """Read the ``dev`` and ``test`` CSV files of one subject.

        Args:
            path: Root directory containing one sub-directory per split.
            name: Subject name; the file read for each split is
                ``<path>/<split>/<name>_<split>.csv``.

        Returns:
            A ``DatasetDict`` with a ``dev`` and a ``test`` entry. Every row
            has the keys ``input``, ``A``, ``B``, ``C``, ``D`` and ``target``,
            taken from the six CSV columns in that order.
        """
        columns = ('input', 'A', 'B', 'C', 'D', 'target')
        splits = DatasetDict()
        for split in ['dev', 'test']:
            csv_path = osp.join(path, split, f'{name}_{split}.csv')
            records = []
            with open(csv_path, encoding='utf-8') as f:
                for fields in csv.reader(f):
                    # Every row must carry exactly the six expected columns.
                    assert len(fields) == 6
                    records.append(dict(zip(columns, fields)))
            splits[split] = Dataset.from_list(records)
        return splits
@@ -1,6 +1,10 @@
1
1
  import ast
2
2
 
3
- import networkx as nx
3
+ try:
4
+ import networkx as nx
5
+ except ImportError:
6
+ nx = None
7
+
4
8
  from datasets import Dataset
5
9
 
6
10
  from opencompass.openicl.icl_evaluator import BaseEvaluator
@@ -1,7 +1,11 @@
1
1
  import ast
2
2
  import json
3
3
 
4
- import networkx as nx
4
+ try:
5
+ import networkx as nx
6
+ except ImportError:
7
+ nx = None
8
+
5
9
  import pandas as pd
6
10
  from datasets import Dataset
7
11
 
@@ -1,7 +1,11 @@
1
1
  import ast
2
2
  import json
3
3
 
4
- import networkx as nx
4
+ try:
5
+ import networkx as nx
6
+ except ImportError:
7
+ nx = None
8
+
5
9
  from datasets import Dataset
6
10
 
7
11
  from opencompass.openicl.icl_evaluator import BaseEvaluator
@@ -0,0 +1,59 @@
1
+ import json
2
+
3
+ from datasets import Dataset
4
+
5
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
6
+ from opencompass.registry import LOAD_DATASET
7
+
8
+ from .base import BaseDataset
9
+
10
+
11
@LOAD_DATASET.register_module()
class QuALITYDataset(BaseDataset):
    """Loader that flattens a QuALITY JSON-lines file into one row per question."""

    @staticmethod
    def load(path: str):
        """Build a ``Dataset`` from a JSON-lines file.

        Each line is parsed as a JSON object with an ``article`` and a list
        of ``questions``. One output row is produced per question, holding
        the article, the question text, the first four options as ``A``-``D``,
        the gold label converted to a letter, and the ``difficult`` flag.

        Args:
            path: Path of the JSON-lines file to read.

        Returns:
            A ``Dataset`` built from the flattened rows.
        """
        rows = []
        with open(path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                record = json.loads(raw_line)
                for item in record['questions']:
                    row = {
                        'article': record['article'],
                        'question': item['question'],
                    }
                    # Index explicitly so a question with fewer than four
                    # options still fails with an IndexError.
                    for position, letter in enumerate('ABCD'):
                        row[letter] = item['options'][position]
                    # The stored gold label is shifted down by one before it
                    # is used to pick the answer letter.
                    row['gold_label'] = 'ABCD'[item['gold_label'] - 1]
                    row['difficult'] = item['difficult']
                    rows.append(row)
        return Dataset.from_list(rows)
40
+
41
+
42
class QuALITYEvaluator(BaseEvaluator):
    """Exact-match accuracy evaluator reporting easy, hard and overall scores."""

    @staticmethod
    def _accuracy(flags):
        """Return the percentage of truthy entries in ``flags``.

        An empty list yields ``0.0`` instead of raising ZeroDivisionError, so
        a test set lacking one difficulty level can still be scored.
        """
        return sum(flags) / len(flags) * 100 if flags else 0.0

    def score(self, predictions, references, test_set):
        """Compute accuracy split by the ``difficult`` flag of each sample.

        Args:
            predictions: Predicted answers, one per sample.
            references: Gold answers, aligned with ``predictions``.
            test_set: Iterable of samples aligned with ``predictions``; each
                sample must provide a ``difficult`` entry, where ``0`` marks
                an easy question and any other value a hard one.

        Returns:
            A dict with ``easy_acc``, ``hard_acc`` and ``all_acc``, each a
            percentage in the range 0-100.
        """
        assert len(predictions) == len(references)
        easy, hard, overall = [], [], []
        for pred, refer, test in zip(predictions, references, test_set):
            is_correct = pred == refer
            overall.append(is_correct)
            bucket = easy if test['difficult'] == 0 else hard
            bucket.append(is_correct)
        # Each accuracy is normalised by the size of its own bucket; the hard
        # accuracy was previously divided by the number of easy samples.
        return dict(easy_acc=self._accuracy(easy),
                    hard_acc=self._accuracy(hard),
                    all_acc=self._accuracy(overall))
@@ -0,0 +1,4 @@
1
+ from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2,
2
+ TheoremQADataset)
3
+ from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3,
4
+ TheoremQAEvaluatorV3)
@@ -4,7 +4,7 @@ from datasets import load_dataset
4
4
 
5
5
  from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
6
6
 
7
- from .base import BaseDataset
7
+ from ..base import BaseDataset
8
8
 
9
9
 
10
10
  @LOAD_DATASET.register_module()
@@ -0,0 +1,66 @@
1
+ import re
2
+ import json
3
+
4
+ from datasets import Dataset, DatasetDict
5
+
6
+ from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS
7
+
8
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
9
+ from ..base import BaseDataset
10
+ from . import utils
11
+ from tqdm import tqdm
12
+
13
+
14
@LOAD_DATASET.register_module()
class TheoremQADatasetV3(BaseDataset):
    """Loader for a TheoremQA JSON file."""

    @staticmethod
    def load(path: str):
        """Load the JSON file at ``path`` into a ``Dataset``.

        The file is expected to contain a list of records, each with an
        ``Answer`` entry. Every ``Answer`` is converted to ``str`` before the
        dataset is built, so the column holds strings only.

        Args:
            path: Path of the JSON file to read.

        Returns:
            A ``Dataset`` built from the records.
        """
        # Decode explicitly as UTF-8 rather than relying on the platform
        # default encoding, matching the other loaders in this package.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            item['Answer'] = str(item['Answer'])
        return Dataset.from_list(data)
25
+
26
+
27
def TheoremQA_postprocess_v3(text: str) -> str:
    """Post-process a model response via ``utils.answer_clean``.

    The response is handed to ``utils.answer_clean`` together with the list
    of "answer is" marker phrases, and the helper's result is returned
    unchanged.

    Args:
        text: Raw model response.

    Returns:
        Whatever ``utils.answer_clean`` returns for this response.
    """
    marker_phrases = ["The answer is:", "The answer is", "the answer is"]
    return utils.answer_clean(marker_phrases, text)
30
+
31
+
32
@ICL_EVALUATORS.register_module()
class TheoremQAEvaluatorV3(BaseEvaluator):
    """Evaluator scoring predictions with ``utils.compare_answer_with_groundtruth``."""

    def score(self, predictions, references, test_set):
        """Score predictions against references.

        Args:
            predictions: Post-processed model answers, one per sample.
            references: Reference answers as strings, aligned with
                ``predictions``.
            test_set: Indexable collection of samples aligned with
                ``predictions``; each sample must provide ``Answer_type``.

        Returns:
            ``{'error': ...}`` when the two input lengths differ. Otherwise a
            dict with ``score`` (percentage of correct answers, ``0.0`` when
            there are no predictions) and ``details`` (one entry per sample
            with the ground truth pair, the prediction and the verdict).
        """
        if len(predictions) != len(references):
            return {"error": "preds and refrs have different length"}

        details = []
        correct = 0
        pairs = zip(predictions, references)
        progress = tqdm(pairs, total=len(predictions))
        for index, (answer, reference) in enumerate(progress):
            answer_type = test_set[index]['Answer_type']
            if answer_type in ['float', 'integer', 'bool']:
                # NOTE(review): eval() executes the reference string as
                # Python code. This is only safe while the dataset file is
                # trusted; consider ast.literal_eval if that ever changes.
                groundtruth = [reference, eval(reference)]
            else:
                groundtruth = [reference, None]

            is_correct = bool(
                utils.compare_answer_with_groundtruth(answer, *groundtruth))
            correct += is_correct

            details.append({
                "correct": groundtruth,
                "pred": answer,
                "is_correct": is_correct,
            })

        # Guard against an empty evaluation set, which previously raised
        # ZeroDivisionError.
        total = len(details)
        score = correct / total * 100 if total else 0.0
        return {'score': score, 'details': details}