opencompass 0.2.2__tar.gz → 0.2.4__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. {opencompass-0.2.2 → opencompass-0.2.4}/PKG-INFO +74 -29
  2. {opencompass-0.2.2 → opencompass-0.2.4}/README.md +73 -28
  3. opencompass-0.2.4/opencompass/__init__.py +1 -0
  4. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +5 -1
  5. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +5 -1
  6. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_SPP.py +5 -1
  7. opencompass-0.2.4/opencompass/datasets/OpenFinData.py +47 -0
  8. opencompass-0.2.4/opencompass/datasets/QuALITY.py +59 -0
  9. opencompass-0.2.4/opencompass/datasets/TheoremQA/__init__.py +4 -0
  10. opencompass-0.2.2/opencompass/datasets/TheoremQA.py → opencompass-0.2.4/opencompass/datasets/TheoremQA/legacy.py +13 -1
  11. opencompass-0.2.4/opencompass/datasets/TheoremQA/main.py +66 -0
  12. opencompass-0.2.4/opencompass/datasets/TheoremQA/number_utils.py +98 -0
  13. opencompass-0.2.4/opencompass/datasets/TheoremQA/utils.py +110 -0
  14. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/__init__.py +7 -1
  15. opencompass-0.2.4/opencompass/datasets/apps.py +877 -0
  16. opencompass-0.2.4/opencompass/datasets/chembench.py +34 -0
  17. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/custom.py +10 -0
  18. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/gpqa.py +10 -32
  19. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/hellaswag.py +27 -1
  20. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/humaneval.py +5 -2
  21. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/humanevalx.py +4 -1
  22. opencompass-0.2.4/opencompass/datasets/lveval/__init__.py +14 -0
  23. opencompass-0.2.4/opencompass/datasets/lveval/evaluators.py +409 -0
  24. opencompass-0.2.4/opencompass/datasets/lveval/lveval_cmrc_mixup.py +28 -0
  25. opencompass-0.2.4/opencompass/datasets/lveval/lveval_dureader_mixup.py +26 -0
  26. opencompass-0.2.4/opencompass/datasets/lveval/lveval_factrecall_en.py +28 -0
  27. opencompass-0.2.4/opencompass/datasets/lveval/lveval_factrecall_zh.py +28 -0
  28. opencompass-0.2.4/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +31 -0
  29. opencompass-0.2.4/opencompass/datasets/lveval/lveval_lic_mixup.py +31 -0
  30. opencompass-0.2.4/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +29 -0
  31. opencompass-0.2.4/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +29 -0
  32. opencompass-0.2.4/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +29 -0
  33. opencompass-0.2.4/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +31 -0
  34. opencompass-0.2.4/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +31 -0
  35. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/math.py +19 -6
  36. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/mathbench.py +1 -1
  37. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/mbpp.py +45 -40
  38. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/natural_question.py +4 -3
  39. opencompass-0.2.4/opencompass/datasets/taco.py +823 -0
  40. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/winogrande.py +33 -3
  41. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/__init__.py +9 -1
  42. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/ai360_api.py +4 -4
  43. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/baichuan_api.py +128 -4
  44. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/baidu_api.py +4 -4
  45. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/base.py +2 -2
  46. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/base_api.py +4 -4
  47. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/bytedance_api.py +4 -4
  48. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/claude_api/claude_api.py +4 -4
  49. opencompass-0.2.4/opencompass/models/gemini_api.py +251 -0
  50. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/huggingface.py +1 -1
  51. opencompass-0.2.4/opencompass/models/hunyuan_api.py +121 -0
  52. opencompass-0.2.4/opencompass/models/krgpt_api.py +134 -0
  53. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/lightllm_api.py +38 -5
  54. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/llama2.py +1 -1
  55. opencompass-0.2.2/opencompass/models/turbomind.py → opencompass-0.2.4/opencompass/models/lmdeploy_pytorch.py +27 -29
  56. opencompass-0.2.4/opencompass/models/lmdeploy_tis.py +200 -0
  57. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/minimax_api.py +4 -4
  58. opencompass-0.2.4/opencompass/models/mistral_api.py +123 -0
  59. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/moonshot_api.py +24 -26
  60. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/nanbeige_api.py +4 -4
  61. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/openai_api.py +49 -17
  62. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/pangu_api.py +4 -4
  63. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/qwen_api.py +28 -14
  64. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/sensetime_api.py +14 -9
  65. opencompass-0.2.4/opencompass/models/turbomind.py +219 -0
  66. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/turbomind_api.py +27 -19
  67. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/turbomind_tis.py +10 -4
  68. opencompass-0.2.4/opencompass/models/unigpt_api.py +147 -0
  69. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/vllm.py +6 -0
  70. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/xunfei_api.py +4 -4
  71. opencompass-0.2.4/opencompass/models/yayi_api.py +261 -0
  72. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/zhipuai_api.py +4 -4
  73. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/zhipuai_v2_api.py +12 -6
  74. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/__init__.py +1 -0
  75. opencompass-0.2.4/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +97 -0
  76. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/lm_evaluator.py +57 -23
  77. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +8 -6
  78. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +2 -2
  79. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_prompt_template.py +4 -4
  80. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/base.py +18 -7
  81. opencompass-0.2.4/opencompass/partitioners/sub_naive.py +220 -0
  82. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/sub_size.py +29 -6
  83. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/registry.py +15 -1
  84. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/base.py +2 -1
  85. opencompass-0.2.4/opencompass/runners/dlc.py +289 -0
  86. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/local.py +23 -9
  87. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/local_api.py +1 -1
  88. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/slurm.py +1 -1
  89. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/slurm_sequential.py +1 -1
  90. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/default.py +1 -1
  91. opencompass-0.2.4/opencompass/summarizers/needlebench.py +737 -0
  92. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/alignmentbench.py +43 -33
  93. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/alpacaeval.py +2 -1
  94. opencompass-0.2.4/opencompass/summarizers/subjective/compass_arena.py +240 -0
  95. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/mtbench.py +55 -44
  96. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/multiround.py +2 -1
  97. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/openicl_eval.py +3 -2
  98. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/openicl_infer.py +12 -5
  99. opencompass-0.2.4/opencompass/tasks/subjective_eval.py +438 -0
  100. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/abbr.py +22 -0
  101. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/build.py +1 -0
  102. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/prompt.py +5 -5
  103. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/run.py +140 -12
  104. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/text_postprocessors.py +5 -5
  105. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass.egg-info/PKG-INFO +74 -29
  106. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass.egg-info/SOURCES.txt +34 -1
  107. opencompass-0.2.4/opencompass.egg-info/entry_points.txt +3 -0
  108. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass.egg-info/requires.txt +4 -1
  109. {opencompass-0.2.2 → opencompass-0.2.4}/setup.py +40 -33
  110. opencompass-0.2.2/opencompass/__init__.py +0 -1
  111. opencompass-0.2.2/opencompass/partitioners/sub_naive.py +0 -110
  112. opencompass-0.2.2/opencompass/runners/dlc.py +0 -229
  113. opencompass-0.2.2/opencompass/summarizers/subjective/compass_arena.py +0 -204
  114. opencompass-0.2.2/opencompass/tasks/subjective_eval.py +0 -282
  115. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/FinanceIQ.py +0 -0
  116. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/GaokaoBench.py +0 -0
  117. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/__init__.py +0 -0
  118. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
  119. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
  120. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
  121. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
  122. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
  123. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
  124. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/prompts.py +0 -0
  125. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/utils.py +0 -0
  126. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/advglue.py +0 -0
  127. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/afqmcd.py +0 -0
  128. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/__init__.py +0 -0
  129. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/agieval.py +0 -0
  130. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/constructions.py +0 -0
  131. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/dataset_loader.py +0 -0
  132. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/evaluation.py +0 -0
  133. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/math_equivalence.py +0 -0
  134. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/post_process.py +0 -0
  135. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/agieval/utils.py +0 -0
  136. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/anli.py +0 -0
  137. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/anthropics_evals.py +0 -0
  138. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/arc.py +0 -0
  139. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/ax.py +0 -0
  140. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/base.py +0 -0
  141. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/bbh.py +0 -0
  142. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/boolq.py +0 -0
  143. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/bustum.py +0 -0
  144. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/c3.py +0 -0
  145. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cb.py +0 -0
  146. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/ceval.py +0 -0
  147. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/chid.py +0 -0
  148. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cibench.py +0 -0
  149. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/circular.py +0 -0
  150. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/civilcomments.py +0 -0
  151. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/clozeTest_maxmin.py +0 -0
  152. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cluewsc.py +0 -0
  153. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cmb.py +0 -0
  154. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cmmlu.py +0 -0
  155. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cmnli.py +0 -0
  156. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cmrc.py +0 -0
  157. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/commonsenseqa.py +0 -0
  158. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/commonsenseqa_cn.py +0 -0
  159. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/copa.py +0 -0
  160. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/crowspairs.py +0 -0
  161. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/crowspairs_cn.py +0 -0
  162. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/csl.py +0 -0
  163. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/cvalues.py +0 -0
  164. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/drcd.py +0 -0
  165. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/drop.py +0 -0
  166. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/ds1000.py +0 -0
  167. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/ds1000_interpreter.py +0 -0
  168. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/eprstmt.py +0 -0
  169. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/flores.py +0 -0
  170. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/game24.py +0 -0
  171. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/govrepcrs.py +0 -0
  172. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/gsm8k.py +0 -0
  173. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/gsm_hard.py +0 -0
  174. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/huggingface.py +0 -0
  175. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/humaneval_multi.py +0 -0
  176. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/hungarian_math.py +0 -0
  177. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/__init__.py +0 -0
  178. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
  179. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
  180. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
  181. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
  182. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
  183. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
  184. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
  185. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
  186. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
  187. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
  188. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
  189. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
  190. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/infinitebench/utils.py +0 -0
  191. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/iwslt2017.py +0 -0
  192. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/jigsawmultilingual.py +0 -0
  193. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/jsonl.py +0 -0
  194. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/kaoshi.py +0 -0
  195. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/lambada.py +0 -0
  196. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/lawbench/__init__.py +0 -0
  197. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/lawbench/lawbench.py +0 -0
  198. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/lcsts.py +0 -0
  199. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/__init__.py +0 -0
  200. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/evaluators.py +0 -0
  201. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_coursera.py +0 -0
  202. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
  203. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
  204. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_gsm100.py +0 -0
  205. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
  206. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
  207. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
  208. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
  209. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_natural_question.py +0 -0
  210. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_news_summ.py +0 -0
  211. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
  212. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
  213. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_quality.py +0 -0
  214. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_review_summ.py +0 -0
  215. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
  216. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
  217. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_tpo.py +0 -0
  218. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
  219. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/lmeval.py +0 -0
  220. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/__init__.py +0 -0
  221. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/evaluators.py +0 -0
  222. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
  223. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
  224. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
  225. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
  226. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
  227. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
  228. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
  229. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
  230. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
  231. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_musique.py +0 -0
  232. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
  233. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
  234. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
  235. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
  236. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
  237. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
  238. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
  239. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
  240. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_trec.py +0 -0
  241. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
  242. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
  243. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/mastermath2024v1.py +0 -0
  244. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/math401.py +0 -0
  245. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/math_intern.py +0 -0
  246. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/__init__.py +0 -0
  247. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/constructions.py +0 -0
  248. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/dataset_loader.py +0 -0
  249. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/evaluation.py +0 -0
  250. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/math_equivalence.py +0 -0
  251. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/medbench.py +0 -0
  252. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/post_process.py +0 -0
  253. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/medbench/utils.py +0 -0
  254. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/mmlu.py +0 -0
  255. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/multirc.py +0 -0
  256. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/narrativeqa.py +0 -0
  257. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/natural_question_cn.py +0 -0
  258. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/obqa.py +0 -0
  259. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/piqa.py +0 -0
  260. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/py150.py +0 -0
  261. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/qasper.py +0 -0
  262. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/qaspercut.py +0 -0
  263. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/race.py +0 -0
  264. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/realtoxicprompts.py +0 -0
  265. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
  266. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/reasonbench/__init__.py +0 -0
  267. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/record.py +0 -0
  268. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/rolebench.py +0 -0
  269. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/safety.py +0 -0
  270. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/scibench.py +0 -0
  271. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/siqa.py +0 -0
  272. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/squad20.py +0 -0
  273. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/storycloze.py +0 -0
  274. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/strategyqa.py +0 -0
  275. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/__init__.py +0 -0
  276. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/alignbench.py +0 -0
  277. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/compass_arena.py +0 -0
  278. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/corev2.py +0 -0
  279. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/creationbench.py +0 -0
  280. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/information_retrival.py +0 -0
  281. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/mtbench.py +0 -0
  282. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/multiround.py +0 -0
  283. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/subjective/subjective_cmp.py +0 -0
  284. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/summedits.py +0 -0
  285. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/summscreen.py +0 -0
  286. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/svamp.py +0 -0
  287. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/tabmwp.py +0 -0
  288. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/__init__.py +0 -0
  289. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
  290. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
  291. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
  292. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
  293. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
  294. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/schema.py +0 -0
  295. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/utils/__init__.py +0 -0
  296. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/utils/convert_results.py +0 -0
  297. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/utils/format_load.py +0 -0
  298. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/utils/meta_template.py +0 -0
  299. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/teval/utils/template.py +0 -0
  300. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/tnews.py +0 -0
  301. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/triviaqa.py +0 -0
  302. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/triviaqarc.py +0 -0
  303. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/truthfulqa.py +0 -0
  304. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/tydiqa.py +0 -0
  305. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/wic.py +0 -0
  306. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/wikibench.py +0 -0
  307. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/winograd.py +0 -0
  308. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/wnli.py +0 -0
  309. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/wsc.py +0 -0
  310. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/xcopa.py +0 -0
  311. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/xiezhi.py +0 -0
  312. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/xlsum.py +0 -0
  313. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/datasets/xsum.py +0 -0
  314. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/metrics/__init__.py +0 -0
  315. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/metrics/dump_results.py +0 -0
  316. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/metrics/mme_score.py +0 -0
  317. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/metrics/seedbench.py +0 -0
  318. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/accessory.py +0 -0
  319. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/alaya.py +0 -0
  320. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/claude_api/__init__.py +0 -0
  321. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/claude_api/postprocessors.py +0 -0
  322. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/glm.py +0 -0
  323. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/intern_model.py +0 -0
  324. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/lagent.py +0 -0
  325. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/langchain.py +0 -0
  326. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/mixtral.py +0 -0
  327. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/models/modelscope.py +0 -0
  328. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/__init__.py +0 -0
  329. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_dataset_reader.py +0 -0
  330. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
  331. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
  332. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
  333. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
  334. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
  335. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +0 -0
  336. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
  337. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
  338. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
  339. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/__init__.py +0 -0
  340. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
  341. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
  342. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +0 -0
  343. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
  344. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +0 -0
  345. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +0 -0
  346. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
  347. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
  348. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
  349. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
  350. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/__init__.py +0 -0
  351. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
  352. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
  353. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
  354. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
  355. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
  356. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
  357. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
  358. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
  359. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
  360. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/utils/__init__.py +0 -0
  361. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/openicl/utils/logging.py +0 -0
  362. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/__init__.py +0 -0
  363. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/mm_naive.py +0 -0
  364. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/naive.py +0 -0
  365. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/num_worker.py +0 -0
  366. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/partitioners/size.py +0 -0
  367. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/runners/__init__.py +0 -0
  368. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/__init__.py +0 -0
  369. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/circular.py +0 -0
  370. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/multi_model.py +0 -0
  371. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/__init__.py +0 -0
  372. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/corev2.py +0 -0
  373. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/creationbench.py +0 -0
  374. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/information_retrival.py +0 -0
  375. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
  376. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/subjective/utils.py +0 -0
  377. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/summarizers/summarizer_pretrain.py +0 -0
  378. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/__init__.py +0 -0
  379. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/base.py +0 -0
  380. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/llm_eval.py +0 -0
  381. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/mm_infer.py +0 -0
  382. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/tasks/openicl_attack.py +0 -0
  383. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/__init__.py +0 -0
  384. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/auxiliary.py +0 -0
  385. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/collect_env.py +0 -0
  386. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/dependency.py +0 -0
  387. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/file.py +0 -0
  388. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/fileio.py +0 -0
  389. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/lark.py +0 -0
  390. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/logging.py +0 -0
  391. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/menu.py +0 -0
  392. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass/utils/types.py +0 -0
  393. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass.egg-info/dependency_links.txt +0 -0
  394. {opencompass-0.2.2 → opencompass-0.2.4}/opencompass.egg-info/top_level.txt +0 -0
  395. {opencompass-0.2.2 → opencompass-0.2.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: opencompass
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: A comprehensive toolkit for large model evaluation
5
5
  Home-page: https://github.com/open-compass/opencompass
6
6
  Author: OpenCompass Contributors
@@ -11,37 +11,55 @@ Description: <div align="center">
11
11
  <br />
12
12
  <br />
13
13
 
14
- [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
15
- [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
14
+ [![][github-release-shield]][github-release-link]
15
+ [![][github-releasedate-shield]][github-releasedate-link]
16
+ [![][github-contributors-shield]][github-contributors-link]<br>
17
+ [![][github-forks-shield]][github-forks-link]
18
+ [![][github-stars-shield]][github-stars-link]
19
+ [![][github-issues-shield]][github-issues-link]
20
+ [![][github-license-shield]][github-license-link]
16
21
 
17
22
  <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
18
23
 
19
24
  [🌐Website](https://opencompass.org.cn/) |
25
+ [📖CompassHub](https://hub.opencompass.org.cn/home) |
26
+ [📊CompassRank](https://rank.opencompass.org.cn/home) |
20
27
  [📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
21
28
  [🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
22
29
  [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
23
30
 
24
31
  English | [简体中文](README_zh-CN.md)
25
32
 
33
+ [![][github-trending-shield]][github-trending-url]
34
+
26
35
  </div>
27
36
 
28
37
  <p align="center">
29
38
  👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
30
39
  </p>
31
40
 
32
- ## 📣 OpenCompass 2023 LLM Annual Leaderboard
41
+ > \[!IMPORTANT\]
42
+ >
43
+ > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
33
44
 
34
- We are honored to have witnessed the tremendous progress of artificial general intelligence together with the community in the past year, and we are also very pleased that **OpenCompass** can help numerous developers and users.
45
+ ## 📣 OpenCompass 2.0
35
46
 
36
- We announce the launch of the **OpenCompass 2023 LLM Annual Leaderboard** plan. We expect to release the annual leaderboard of the LLMs in January 2024, systematically evaluating the performance of LLMs in various capabilities such as language, knowledge, reasoning, creation, long-text, and agents.
47
+ We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
48
+ ![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c323-470a-991e-2b37ab5350b2)
37
49
 
38
- At that time, we will release rankings for both open-source models and commercial API models, aiming to provide a comprehensive, objective, and neutral reference for the industry and research community.
50
+ **CompassRank** has been significantly enhanced into a set of leaderboards that now incorporate both open-source and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry.
39
51
 
40
- We sincerely invite various large models to join the OpenCompass to showcase their performance advantages in different fields. At the same time, we also welcome researchers and developers to provide valuable suggestions and contributions to jointly promote the development of the LLMs. If you have any questions or needs, please feel free to [contact us](mailto:opencompass@pjlab.org.cn). In addition, relevant evaluation contents, performance statistics, and evaluation methods will be open-source along with the leaderboard release.
52
+ **CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit).
41
53
 
42
- We have provided the more details of the CompassBench 2023 in [Doc](docs/zh_cn/advanced_guides/compassbench_intro.md).
54
+ **CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. You are welcome to try our toolkits in your research and products.
43
55
 
44
- Let's look forward to the release of the OpenCompass 2023 LLM Annual Leaderboard!
56
+ <details>
57
+ <summary><kbd>Star History</kbd></summary>
58
+ <picture>
59
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
60
+ <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
61
+ </picture>
62
+ </details>
45
63
 
46
64
  ## 🧭 Welcome
47
65
 
@@ -60,12 +78,9 @@ Description: <div align="center">
60
78
 
61
79
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
62
80
 
63
- - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! 🔥🔥🔥.
64
- - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
65
- - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development. 🔥🔥🔥.
66
- - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details! 🔥🔥🔥.
67
- - **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), currently support 20+ VLMs and 7 multi-modal benchmarks (including MMBench series).
68
- - **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation.
81
+ - **\[2024.04.22\]** We now support the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py). Welcome to try them! 🔥🔥🔥
82
+ - **\[2024.02.29\]** We now support MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html).
83
+ - **\[2024.01.30\]** We released OpenCompass 2.0. See [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information!
69
84
 
70
85
  > [More](docs/en/notes/news.md)
71
86
 
@@ -87,7 +102,7 @@ Description: <div align="center">
87
102
 
88
103
  ## 📊 Leaderboard
89
104
 
90
- We provide [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
105
+ We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
91
106
 
92
107
  <p align="right"><a href="#top">🔝Back to top</a></p>
93
108
 
@@ -122,8 +137,8 @@ Description: <div align="center">
122
137
 
123
138
  ```bash
124
139
  # Download dataset to data/ folder
125
- wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
126
- unzip OpenCompassData-core-20231110.zip
140
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
141
+ unzip OpenCompassData-core-20240207.zip
127
142
  ```
128
143
 
129
144
  Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
@@ -428,10 +443,6 @@ Description: <div align="center">
428
443
  </tbody>
429
444
  </table>
430
445
 
431
- ## OpenCompass Ecosystem
432
-
433
- <p align="right"><a href="#top">🔝Back to top</a></p>
434
-
435
446
  ## 📖 Model Support
436
447
 
437
448
  <table align="center">
@@ -452,6 +463,7 @@ Description: <div align="center">
452
463
 
453
464
  - [InternLM](https://github.com/InternLM/InternLM)
454
465
  - [LLaMA](https://github.com/facebookresearch/llama)
466
+ - [LLaMA3](https://github.com/meta-llama/llama3)
455
467
  - [Vicuna](https://github.com/lm-sys/FastChat)
456
468
  - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
457
469
  - [Baichuan](https://github.com/baichuan-inc)
@@ -461,12 +473,14 @@ Description: <div align="center">
461
473
  - [TigerBot](https://github.com/TigerResearch/TigerBot)
462
474
  - [Qwen](https://github.com/QwenLM/Qwen)
463
475
  - [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
476
+ - [Gemma](https://huggingface.co/google/gemma-7b)
464
477
  - ...
465
478
 
466
479
  </td>
467
480
  <td>
468
481
 
469
482
  - OpenAI
483
+ - Gemini
470
484
  - Claude
471
485
  - ZhipuAI(ChatGLM)
472
486
  - Baichuan
@@ -489,18 +503,18 @@ Description: <div align="center">
489
503
 
490
504
  ## 🔜 Roadmap
491
505
 
492
- - [ ] Subjective Evaluation
506
+ - [x] Subjective Evaluation
493
507
  - [ ] Release CompassArena
494
- - [ ] Subjective evaluation dataset.
508
+ - [x] Subjective evaluation.
495
509
  - [x] Long-context
496
- - [ ] Long-context evaluation with extensive datasets.
510
+ - [x] Long-context evaluation with extensive datasets.
497
511
  - [ ] Long-context leaderboard.
498
- - [ ] Coding
512
+ - [x] Coding
499
513
  - [ ] Coding evaluation leaderboard.
500
514
  - [x] Non-python language evaluation service.
501
- - [ ] Agent
515
+ - [x] Agent
502
516
  - [ ] Support various agent frameworks.
503
- - [ ] Evaluation of tool use of the LLMs.
517
+ - [x] Evaluation of tool use of the LLMs.
504
518
  - [x] Robustness
505
519
  - [x] Support various attack method
506
520
 
@@ -508,6 +522,20 @@ Description: <div align="center">
508
522
 
509
523
  We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
510
524
 
525
+ <!-- Copy-paste in your Readme.md file -->
526
+
527
+ <!-- Made with [OSS Insight](https://ossinsight.io/) -->
528
+
529
+ <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
530
+ <table>
531
+ <tr>
532
+ <th colspan="2">
533
+ <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
534
+ </th>
535
+ </tr>
536
+ </table>
537
+ </a>
538
+
511
539
  ## 🤝 Acknowledgements
512
540
 
513
541
  Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
@@ -527,6 +555,23 @@ Description: <div align="center">
527
555
 
528
556
  <p align="right"><a href="#top">🔝Back to top</a></p>
529
557
 
558
+ [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
559
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
560
+ [github-forks-link]: https://github.com/open-compass/opencompass/network/members
561
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
562
+ [github-issues-link]: https://github.com/open-compass/opencompass/issues
563
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
564
+ [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
565
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
566
+ [github-release-link]: https://github.com/open-compass/opencompass/releases
567
+ [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
568
+ [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
569
+ [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
570
+ [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
571
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
572
+ [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
573
+ [github-trending-url]: https://trendshift.io/repositories/6630
574
+
530
575
  Keywords: AI,NLP,in-context learning,large language model,evaluation,benchmark,llm
531
576
  Platform: UNKNOWN
532
577
  Classifier: Programming Language :: Python :: 3.8
@@ -3,37 +3,55 @@
3
3
  <br />
4
4
  <br />
5
5
 
6
- [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
7
- [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
6
+ [![][github-release-shield]][github-release-link]
7
+ [![][github-releasedate-shield]][github-releasedate-link]
8
+ [![][github-contributors-shield]][github-contributors-link]<br>
9
+ [![][github-forks-shield]][github-forks-link]
10
+ [![][github-stars-shield]][github-stars-link]
11
+ [![][github-issues-shield]][github-issues-link]
12
+ [![][github-license-shield]][github-license-link]
8
13
 
9
14
  <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
10
15
 
11
16
  [🌐Website](https://opencompass.org.cn/) |
17
+ [📖CompassHub](https://hub.opencompass.org.cn/home) |
18
+ [📊CompassRank](https://rank.opencompass.org.cn/home) |
12
19
  [📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
13
20
  [🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
14
21
  [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
15
22
 
16
23
  English | [简体中文](README_zh-CN.md)
17
24
 
25
+ [![][github-trending-shield]][github-trending-url]
26
+
18
27
  </div>
19
28
 
20
29
  <p align="center">
21
30
  👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
22
31
  </p>
23
32
 
24
- ## 📣 OpenCompass 2023 LLM Annual Leaderboard
33
+ > \[!IMPORTANT\]
34
+ >
35
+ > **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
25
36
 
26
- We are honored to have witnessed the tremendous progress of artificial general intelligence together with the community in the past year, and we are also very pleased that **OpenCompass** can help numerous developers and users.
37
+ ## 📣 OpenCompass 2.0
27
38
 
28
- We announce the launch of the **OpenCompass 2023 LLM Annual Leaderboard** plan. We expect to release the annual leaderboard of the LLMs in January 2024, systematically evaluating the performance of LLMs in various capabilities such as language, knowledge, reasoning, creation, long-text, and agents.
39
+ We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
40
+ ![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c323-470a-991e-2b37ab5350b2)
29
41
 
30
- At that time, we will release rankings for both open-source models and commercial API models, aiming to provide a comprehensive, objective, and neutral reference for the industry and research community.
42
+ **CompassRank** has been significantly enhanced into a set of leaderboards that now incorporate both open-source and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry.
31
43
 
32
- We sincerely invite various large models to join the OpenCompass to showcase their performance advantages in different fields. At the same time, we also welcome researchers and developers to provide valuable suggestions and contributions to jointly promote the development of the LLMs. If you have any questions or needs, please feel free to [contact us](mailto:opencompass@pjlab.org.cn). In addition, relevant evaluation contents, performance statistics, and evaluation methods will be open-source along with the leaderboard release.
44
+ **CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit).
33
45
 
34
- We have provided the more details of the CompassBench 2023 in [Doc](docs/zh_cn/advanced_guides/compassbench_intro.md).
46
+ **CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. You are welcome to try our toolkits in your research and products.
35
47
 
36
- Let's look forward to the release of the OpenCompass 2023 LLM Annual Leaderboard!
48
+ <details>
49
+ <summary><kbd>Star History</kbd></summary>
50
+ <picture>
51
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
52
+ <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
53
+ </picture>
54
+ </details>
37
55
 
38
56
  ## 🧭 Welcome
39
57
 
@@ -52,12 +70,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
52
70
 
53
71
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
54
72
 
55
- - **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! 🔥🔥🔥.
56
- - **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
57
- - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development. 🔥🔥🔥.
58
- - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details! 🔥🔥🔥.
59
- - **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), currently support 20+ VLMs and 7 multi-modal benchmarks (including MMBench series).
60
- - **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation.
73
+ - **\[2024.04.22\]** We now support the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py). Welcome to try them! 🔥🔥🔥
74
+ - **\[2024.02.29\]** We now support MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html).
75
+ - **\[2024.01.30\]** We released OpenCompass 2.0. See [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information!
61
76
 
62
77
  > [More](docs/en/notes/news.md)
63
78
 
@@ -79,7 +94,7 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
79
94
 
80
95
  ## 📊 Leaderboard
81
96
 
82
- We provide [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
97
+ We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
83
98
 
84
99
  <p align="right"><a href="#top">🔝Back to top</a></p>
85
100
 
@@ -114,8 +129,8 @@ pip install -e .
114
129
 
115
130
  ```bash
116
131
  # Download dataset to data/ folder
117
- wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
118
- unzip OpenCompassData-core-20231110.zip
132
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
133
+ unzip OpenCompassData-core-20240207.zip
119
134
  ```
120
135
 
121
136
  Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
@@ -420,10 +435,6 @@ Through the command line or configuration files, OpenCompass also supports evalu
420
435
  </tbody>
421
436
  </table>
422
437
 
423
- ## OpenCompass Ecosystem
424
-
425
- <p align="right"><a href="#top">🔝Back to top</a></p>
426
-
427
438
  ## 📖 Model Support
428
439
 
429
440
  <table align="center">
@@ -444,6 +455,7 @@ Through the command line or configuration files, OpenCompass also supports evalu
444
455
 
445
456
  - [InternLM](https://github.com/InternLM/InternLM)
446
457
  - [LLaMA](https://github.com/facebookresearch/llama)
458
+ - [LLaMA3](https://github.com/meta-llama/llama3)
447
459
  - [Vicuna](https://github.com/lm-sys/FastChat)
448
460
  - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
449
461
  - [Baichuan](https://github.com/baichuan-inc)
@@ -453,12 +465,14 @@ Through the command line or configuration files, OpenCompass also supports evalu
453
465
  - [TigerBot](https://github.com/TigerResearch/TigerBot)
454
466
  - [Qwen](https://github.com/QwenLM/Qwen)
455
467
  - [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
468
+ - [Gemma](https://huggingface.co/google/gemma-7b)
456
469
  - ...
457
470
 
458
471
  </td>
459
472
  <td>
460
473
 
461
474
  - OpenAI
475
+ - Gemini
462
476
  - Claude
463
477
  - ZhipuAI(ChatGLM)
464
478
  - Baichuan
@@ -481,18 +495,18 @@ Through the command line or configuration files, OpenCompass also supports evalu
481
495
 
482
496
  ## 🔜 Roadmap
483
497
 
484
- - [ ] Subjective Evaluation
498
+ - [x] Subjective Evaluation
485
499
  - [ ] Release CompassArena
486
- - [ ] Subjective evaluation dataset.
500
+ - [x] Subjective evaluation.
487
501
  - [x] Long-context
488
- - [ ] Long-context evaluation with extensive datasets.
502
+ - [x] Long-context evaluation with extensive datasets.
489
503
  - [ ] Long-context leaderboard.
490
- - [ ] Coding
504
+ - [x] Coding
491
505
  - [ ] Coding evaluation leaderboard.
492
506
  - [x] Non-python language evaluation service.
493
- - [ ] Agent
507
+ - [x] Agent
494
508
  - [ ] Support various agent frameworks.
495
- - [ ] Evaluation of tool use of the LLMs.
509
+ - [x] Evaluation of tool use of the LLMs.
496
510
  - [x] Robustness
497
511
  - [x] Support various attack method
498
512
 
@@ -500,6 +514,20 @@ Through the command line or configuration files, OpenCompass also supports evalu
500
514
 
501
515
  We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
502
516
 
517
+ <!-- Copy-paste in your Readme.md file -->
518
+
519
+ <!-- Made with [OSS Insight](https://ossinsight.io/) -->
520
+
521
+ <a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
522
+ <table>
523
+ <tr>
524
+ <th colspan="2">
525
+ <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
526
+ </th>
527
+ </tr>
528
+ </table>
529
+ </a>
530
+
503
531
  ## 🤝 Acknowledgements
504
532
 
505
533
  Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
@@ -518,3 +546,20 @@ Some datasets and prompt implementations are modified from [chain-of-thought-hub
518
546
  ```
519
547
 
520
548
  <p align="right"><a href="#top">🔝Back to top</a></p>
549
+
550
+ [github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
551
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
552
+ [github-forks-link]: https://github.com/open-compass/opencompass/network/members
553
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
554
+ [github-issues-link]: https://github.com/open-compass/opencompass/issues
555
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
556
+ [github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
557
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
558
+ [github-release-link]: https://github.com/open-compass/opencompass/releases
559
+ [github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
560
+ [github-releasedate-link]: https://github.com/open-compass/opencompass/releases
561
+ [github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
562
+ [github-stars-link]: https://github.com/open-compass/opencompass/stargazers
563
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
564
+ [github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
565
+ [github-trending-url]: https://trendshift.io/repositories/6630
@@ -0,0 +1 @@
1
+ __version__ = '0.2.4'
@@ -1,6 +1,10 @@
1
1
  import ast
2
2
 
3
- import networkx as nx
3
+ try:
4
+ import networkx as nx
5
+ except ImportError:
6
+ nx = None
7
+
4
8
  from datasets import Dataset
5
9
 
6
10
  from opencompass.openicl.icl_evaluator import BaseEvaluator
@@ -1,7 +1,11 @@
1
1
  import ast
2
2
  import json
3
3
 
4
- import networkx as nx
4
+ try:
5
+ import networkx as nx
6
+ except ImportError:
7
+ nx = None
8
+
5
9
  import pandas as pd
6
10
  from datasets import Dataset
7
11
 
@@ -1,7 +1,11 @@
1
1
  import ast
2
2
  import json
3
3
 
4
- import networkx as nx
4
+ try:
5
+ import networkx as nx
6
+ except ImportError:
7
+ nx = None
8
+
5
9
  from datasets import Dataset
6
10
 
7
11
  from opencompass.openicl.icl_evaluator import BaseEvaluator
@@ -0,0 +1,47 @@
1
+ import json
2
+ import os.path as osp
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from .base import BaseDataset
10
+
11
+
12
@LOAD_DATASET.register_module()
class OpenFinDataDataset(BaseDataset):
    """Loader for a single OpenFinData subset stored as a JSON file."""

    @staticmethod
    def load(path: str, name: str):
        """Read ``<path>/<name>.json`` and wrap its content in a ``Dataset``.

        Args:
            path: Directory that contains the subset files.
            name: Subset name; the file read is ``{name}.json``.

        Returns:
            The result of ``Dataset.from_list`` applied to the decoded JSON
            content.
        """
        json_path = osp.join(path, f'{name}.json')
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default: the references are split on the CJK delimiter '、' by the
        # evaluator in this module, so a locale-dependent codec can fail or
        # corrupt the text.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return Dataset.from_list(data)
20
+
21
+
22
@ICL_EVALUATORS.register_module()
class OpenFinDataKWEvaluator(BaseEvaluator):
    """Keyword-hit evaluator for OpenFinData.

    Each reference is a list of keywords joined by the delimiter '、'. A
    prediction counts as correct only when every keyword occurs in it as a
    substring.
    """

    def __init__(self, ):
        super().__init__()

    def score(self, predictions, references):
        """Compute the percentage of predictions containing all keywords.

        Args:
            predictions: Model outputs, one string per sample.
            references: Keyword strings ('、'-separated), aligned with
                ``predictions``.

        Returns:
            A dict with a single key ``'accuracy'`` holding a percentage in
            ``[0, 100]`` rounded to two decimals; ``0.0`` when there are no
            samples.

        Raises:
            AssertionError: If the two sequences differ in length.
        """
        assert len(predictions) == len(references)

        hits = [
            all(keyword in prediction for keyword in reference.split('、'))
            for prediction, reference in zip(predictions, references)
        ]

        # Guard the empty case explicitly rather than dividing by zero.
        if not hits:
            return {'accuracy': 0.0}

        # Scale to a percentage first and round last, so the reported value
        # does not pick up float noise from multiplying a rounded ratio.
        return {'accuracy': round(100 * sum(hits) / len(hits), 2)}
@@ -0,0 +1,59 @@
1
+ import json
2
+
3
+ from datasets import Dataset
4
+
5
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
6
+ from opencompass.registry import LOAD_DATASET
7
+
8
+ from .base import BaseDataset
9
+
10
+
11
@LOAD_DATASET.register_module()
class QuALITYDataset(BaseDataset):
    """Loader that flattens a QuALITY JSON-lines file into one row per
    question."""

    @staticmethod
    def load(path: str):
        """Build a ``Dataset`` from the JSON-lines file at ``path``.

        Every line is decoded as one JSON record holding an ``article`` and
        a list of ``questions``. Each question becomes a row that repeats the
        article and carries the question text, its first four options as
        columns ``A``-``D``, the gold answer as a letter, and the
        ``difficult`` flag.

        Args:
            path: Location of the JSON-lines file.

        Returns:
            The result of ``Dataset.from_list`` applied to the flattened
            rows.
        """
        letters = 'ABCD'
        rows = []
        with open(path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                record = json.loads(raw_line)
                for item in record['questions']:
                    row = {
                        'article': record['article'],
                        'question': item['question'],
                        'A': item['options'][0],
                        'B': item['options'][1],
                        'C': item['options'][2],
                        'D': item['options'][3],
                        # gold_label is 1-based in the file; shift it to
                        # index into the option letters.
                        'gold_label': letters[item['gold_label'] - 1],
                        'difficult': item['difficult'],
                    }
                    rows.append(row)
        return Dataset.from_list(rows)
40
+
41
+
42
class QuALITYEvaluator(BaseEvaluator):
    """Exact-match evaluator reporting accuracy on the easy subset, the hard
    subset and all questions."""

    @staticmethod
    def _accuracy(flags):
        """Return the percentage of truthy entries in ``flags``.

        An empty list yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
        """
        if not flags:
            return 0.0
        return sum(flags) / len(flags) * 100

    def score(self, predictions, references, test_set):
        """Score predictions by exact match against the references.

        Args:
            predictions: Predicted answers, one per sample.
            references: Gold answers aligned with ``predictions``.
            test_set: Iterable of per-sample records aligned with
                ``predictions``; each record is indexed with ``'difficult'``.

        Returns:
            A dict with ``easy_acc``, ``hard_acc`` and ``all_acc``, each a
            percentage. A subset without samples is reported as ``0.0``.

        Raises:
            AssertionError: If ``predictions`` and ``references`` differ in
                length.
        """
        assert len(predictions) == len(references)
        easy, hard, overall = [], [], []
        for pred, refer, test in zip(predictions, references, test_set):
            correct = pred == refer
            overall.append(correct)
            # Samples whose 'difficult' field equals 0 go to the easy
            # bucket; everything else is counted as hard.
            if test['difficult'] == 0:
                easy.append(correct)
            else:
                hard.append(correct)
        # Each subset is normalised by its own size (hard_acc was previously
        # divided by the number of easy samples).
        return dict(easy_acc=self._accuracy(easy),
                    hard_acc=self._accuracy(hard),
                    all_acc=self._accuracy(overall))
@@ -0,0 +1,4 @@
1
+ from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2,
2
+ TheoremQADataset)
3
+ from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3,
4
+ TheoremQAEvaluatorV3)
@@ -4,7 +4,7 @@ from datasets import load_dataset
4
4
 
5
5
  from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
6
6
 
7
- from .base import BaseDataset
7
+ from ..base import BaseDataset
8
8
 
9
9
 
10
10
  @LOAD_DATASET.register_module()
@@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str:
24
24
  else:
25
25
  text = matches[0].strip().strip('.,?!\"\';:')
26
26
  return text
27
+
28
+
29
def TheoremQA_postprocess_v2(text: str) -> str:
    """Extract the final answer from the last line of a model response.

    Only the last line of the stripped response is inspected. Its
    space-separated words are scanned from the end; scanning stops at the
    first word that is ``is``, ``be`` or ``are``, or that ends with a colon.
    The words after that stop word, in their original order, form the answer,
    which is returned without surrounding whitespace or periods.

    Args:
        text: Raw model output.

    Returns:
        The extracted answer string (possibly empty).
    """
    stop_words = ('is', 'be', 'are')
    last_line = text.strip().rpartition('\n')[2]

    tail_words = []
    for word in reversed(last_line.split(' ')):
        if word in stop_words or word.endswith(':'):
            break
        tail_words.append(word)

    # Words were gathered back to front; restore reading order before joining.
    tail_words.reverse()
    return ' '.join(tail_words).strip().strip('.')
+ return prediction