opencompass 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (474)
  1. {opencompass-0.2.5 → opencompass-0.2.6}/PKG-INFO +9 -1
  2. {opencompass-0.2.5 → opencompass-0.2.6}/README.md +8 -0
  3. opencompass-0.2.6/opencompass/__init__.py +1 -0
  4. opencompass-0.2.6/opencompass/cli/main.py +383 -0
  5. opencompass-0.2.6/opencompass/datasets/IFEval/__init__.py +0 -0
  6. opencompass-0.2.6/opencompass/datasets/IFEval/evaluation_main.py +141 -0
  7. opencompass-0.2.6/opencompass/datasets/IFEval/ifeval.py +95 -0
  8. opencompass-0.2.6/opencompass/datasets/IFEval/instructions.py +1570 -0
  9. opencompass-0.2.6/opencompass/datasets/IFEval/instructions_registry.py +190 -0
  10. opencompass-0.2.6/opencompass/datasets/IFEval/instructions_util.py +145 -0
  11. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/__init__.py +1 -0
  12. opencompass-0.2.6/opencompass/datasets/benbench.py +88 -0
  13. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/charm.py +1 -1
  14. opencompass-0.2.6/opencompass/datasets/humaneval.py +173 -0
  15. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/__init__.py +1 -0
  16. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/cjft.py +19 -0
  17. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/flzx.py +18 -0
  18. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ftcs.py +19 -0
  19. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jdzy.py +36 -0
  20. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py +29 -0
  21. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py +29 -0
  22. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jetq.py +43 -0
  23. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/lblj.py +29 -0
  24. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py +76 -0
  25. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py +70 -0
  26. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py +51 -0
  27. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/sjjc.py +64 -0
  28. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/wbfl.py +42 -0
  29. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/wsjd.py +52 -0
  30. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/xxcq.py +17 -0
  31. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ydlj.py +17 -0
  32. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/yqzy.py +18 -0
  33. opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/zxfl.py +27 -0
  34. opencompass-0.2.6/opencompass/datasets/lawbench/utils/__init__.py +1 -0
  35. opencompass-0.2.6/opencompass/datasets/lawbench/utils/char_smi.py +456 -0
  36. opencompass-0.2.6/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py +433 -0
  37. opencompass-0.2.6/opencompass/datasets/lawbench/utils/comprehension_scores.py +82 -0
  38. opencompass-0.2.6/opencompass/datasets/lawbench/utils/function_utils.py +49 -0
  39. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/__init__.py +1 -0
  40. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/alignment.py +332 -0
  41. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/annotator.py +76 -0
  42. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/classifier.py +150 -0
  43. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/merger.py +273 -0
  44. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/tokenization.py +346 -0
  45. opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/tokenizer.py +91 -0
  46. opencompass-0.2.6/opencompass/datasets/lawbench/utils/parallel_to_m2.py +221 -0
  47. opencompass-0.2.6/opencompass/datasets/lawbench/utils/rc_f1.py +158 -0
  48. opencompass-0.2.6/opencompass/datasets/mathbench.py +381 -0
  49. opencompass-0.2.6/opencompass/datasets/mmlu_pro.py +31 -0
  50. opencompass-0.2.6/opencompass/datasets/needlebench/__init__.py +0 -0
  51. opencompass-0.2.6/opencompass/datasets/needlebench/atc.py +247 -0
  52. opencompass-0.2.6/opencompass/datasets/needlebench/atc_choice.py +169 -0
  53. opencompass-0.2.6/opencompass/datasets/needlebench/multi.py +257 -0
  54. opencompass-0.2.6/opencompass/datasets/needlebench/origin.py +277 -0
  55. opencompass-0.2.6/opencompass/datasets/needlebench/parallel.py +311 -0
  56. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/__init__.py +5 -0
  57. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/alignbench.py +3 -15
  58. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/arena_hard.py +1 -1
  59. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/compass_arena.py +1 -5
  60. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/compassbench.py +3 -2
  61. opencompass-0.2.6/opencompass/datasets/subjective/compassbench_control_length_bias.py +130 -0
  62. opencompass-0.2.6/opencompass/datasets/subjective/fofo.py +36 -0
  63. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/mtbench.py +7 -2
  64. opencompass-0.2.6/opencompass/datasets/subjective/mtbench101.py +325 -0
  65. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/subjective_cmp.py +1 -1
  66. opencompass-0.2.6/opencompass/datasets/subjective/wildbench.py +249 -0
  67. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/taco.py +2 -1
  68. opencompass-0.2.6/opencompass/datasets/teval/utils/__init__.py +0 -0
  69. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/__init__.py +2 -1
  70. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/ai360_api.py +10 -4
  71. opencompass-0.2.6/opencompass/models/baichuan_api.py +179 -0
  72. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/baidu_api.py +2 -0
  73. opencompass-0.2.5/opencompass/models/hunyuan_api.py → opencompass-0.2.6/opencompass/models/doubao.py +31 -42
  74. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/huggingface_above_v4_33.py +26 -19
  75. opencompass-0.2.6/opencompass/models/hunyuan_api.py +151 -0
  76. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/minimax_api.py +4 -1
  77. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/openai_api.py +9 -1
  78. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/sensetime_api.py +20 -11
  79. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/turbomind_with_tf_above_v4_33.py +13 -11
  80. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/vllm.py +16 -2
  81. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/vllm_with_tf_above_v4_33.py +11 -4
  82. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/xunfei_api.py +28 -21
  83. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/yi_api.py +5 -1
  84. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/lm_evaluator.py +45 -59
  85. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +21 -0
  86. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/num_worker.py +7 -2
  87. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/sub_naive.py +74 -64
  88. opencompass-0.2.6/opencompass/partitioners/sub_num_worker.py +209 -0
  89. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/sub_size.py +106 -87
  90. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/dlc.py +6 -2
  91. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/local.py +22 -10
  92. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/default.py +8 -3
  93. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/__init__.py +4 -1
  94. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/alignmentbench.py +36 -28
  95. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/alpacaeval.py +43 -27
  96. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/arenahard.py +52 -19
  97. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/compass_arena.py +20 -11
  98. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/compassbench.py +49 -42
  99. opencompass-0.2.6/opencompass/summarizers/subjective/fofo.py +164 -0
  100. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/mtbench.py +47 -44
  101. opencompass-0.2.6/opencompass/summarizers/subjective/mtbench101.py +147 -0
  102. opencompass-0.2.6/opencompass/summarizers/subjective/subjective.py +105 -0
  103. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/utils.py +5 -8
  104. opencompass-0.2.6/opencompass/summarizers/subjective/wildbench.py +295 -0
  105. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/openicl_infer.py +3 -4
  106. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/subjective_eval.py +10 -1
  107. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/prompt.py +13 -9
  108. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/run.py +17 -5
  109. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/text_postprocessors.py +1 -2
  110. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/PKG-INFO +9 -1
  111. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/SOURCES.txt +59 -1
  112. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/requires.txt +2 -2
  113. opencompass-0.2.5/opencompass/__init__.py +0 -1
  114. opencompass-0.2.5/opencompass/datasets/humaneval.py +0 -233
  115. opencompass-0.2.5/opencompass/datasets/mathbench.py +0 -106
  116. opencompass-0.2.5/opencompass/models/baichuan_api.py +0 -283
  117. opencompass-0.2.5/opencompass/summarizers/subjective/information_retrival.py +0 -138
  118. {opencompass-0.2.5/opencompass/datasets/teval/utils → opencompass-0.2.6/opencompass/cli}/__init__.py +0 -0
  119. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/FinanceIQ.py +0 -0
  120. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/GaokaoBench.py +0 -0
  121. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/MMLUArabic.py +0 -0
  122. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/__init__.py +0 -0
  123. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +0 -0
  124. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
  125. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +0 -0
  126. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
  127. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
  128. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
  129. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
  130. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
  131. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_SPP.py +0 -0
  132. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/prompts.py +0 -0
  133. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/utils.py +0 -0
  134. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/OpenFinData.py +0 -0
  135. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/QuALITY.py +0 -0
  136. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/__init__.py +0 -0
  137. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/legacy.py +0 -0
  138. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/main.py +0 -0
  139. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/number_utils.py +0 -0
  140. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/utils.py +0 -0
  141. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/advglue.py +0 -0
  142. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/afqmcd.py +0 -0
  143. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/__init__.py +0 -0
  144. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/agieval.py +0 -0
  145. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/constructions.py +0 -0
  146. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/dataset_loader.py +0 -0
  147. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/evaluation.py +0 -0
  148. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/math_equivalence.py +0 -0
  149. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/post_process.py +0 -0
  150. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/agieval/utils.py +0 -0
  151. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/anli.py +0 -0
  152. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/anthropics_evals.py +0 -0
  153. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/apps.py +0 -0
  154. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/arc.py +0 -0
  155. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/ax.py +0 -0
  156. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/base.py +0 -0
  157. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/bbh.py +0 -0
  158. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/boolq.py +0 -0
  159. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/bustum.py +0 -0
  160. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/c3.py +0 -0
  161. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cb.py +0 -0
  162. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/ceval.py +0 -0
  163. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/chembench.py +0 -0
  164. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/chid.py +0 -0
  165. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cibench.py +0 -0
  166. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/circular.py +0 -0
  167. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/civilcomments.py +0 -0
  168. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/clozeTest_maxmin.py +0 -0
  169. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cluewsc.py +0 -0
  170. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cmb.py +0 -0
  171. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cmmlu.py +0 -0
  172. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cmnli.py +0 -0
  173. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cmrc.py +0 -0
  174. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/commonsenseqa.py +0 -0
  175. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/commonsenseqa_cn.py +0 -0
  176. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/copa.py +0 -0
  177. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/crowspairs.py +0 -0
  178. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/crowspairs_cn.py +0 -0
  179. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/csl.py +0 -0
  180. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/custom.py +0 -0
  181. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/cvalues.py +0 -0
  182. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/drcd.py +0 -0
  183. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/drop.py +0 -0
  184. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/drop_simple_eval.py +0 -0
  185. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/ds1000.py +0 -0
  186. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/ds1000_interpreter.py +0 -0
  187. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/eprstmt.py +0 -0
  188. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/flames.py +0 -0
  189. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/flores.py +0 -0
  190. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/game24.py +0 -0
  191. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/govrepcrs.py +0 -0
  192. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/gpqa.py +0 -0
  193. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/gsm8k.py +0 -0
  194. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/gsm_hard.py +0 -0
  195. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/hellaswag.py +0 -0
  196. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/huggingface.py +0 -0
  197. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/humaneval_multi.py +0 -0
  198. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/humanevalx.py +0 -0
  199. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/hungarian_math.py +0 -0
  200. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/__init__.py +0 -0
  201. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
  202. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
  203. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
  204. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
  205. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
  206. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
  207. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
  208. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
  209. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
  210. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
  211. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
  212. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
  213. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/infinitebench/utils.py +0 -0
  214. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/iwslt2017.py +0 -0
  215. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/jigsawmultilingual.py +0 -0
  216. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/jsonl.py +0 -0
  217. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/kaoshi.py +0 -0
  218. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lambada.py +0 -0
  219. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lawbench/__init__.py +0 -0
  220. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lawbench/lawbench.py +0 -0
  221. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lcsts.py +0 -0
  222. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/__init__.py +0 -0
  223. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/evaluators.py +0 -0
  224. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_coursera.py +0 -0
  225. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
  226. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
  227. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_gsm100.py +0 -0
  228. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
  229. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
  230. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
  231. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
  232. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_natural_question.py +0 -0
  233. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_news_summ.py +0 -0
  234. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
  235. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
  236. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_quality.py +0 -0
  237. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_review_summ.py +0 -0
  238. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
  239. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
  240. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_tpo.py +0 -0
  241. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
  242. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/llm_compression.py +0 -0
  243. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lmeval.py +0 -0
  244. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/__init__.py +0 -0
  245. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/evaluators.py +0 -0
  246. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
  247. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
  248. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
  249. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
  250. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
  251. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
  252. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
  253. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
  254. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
  255. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_musique.py +0 -0
  256. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
  257. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
  258. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
  259. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
  260. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
  261. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
  262. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
  263. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
  264. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_trec.py +0 -0
  265. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
  266. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
  267. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/__init__.py +0 -0
  268. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/evaluators.py +0 -0
  269. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
  270. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
  271. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
  272. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
  273. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
  274. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
  275. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
  276. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
  277. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
  278. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
  279. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
  280. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/mastermath2024v1.py +0 -0
  281. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/math.py +0 -0
  282. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/math401.py +0 -0
  283. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/math_intern.py +0 -0
  284. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/mbpp.py +0 -0
  285. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/__init__.py +0 -0
  286. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/constructions.py +0 -0
  287. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/dataset_loader.py +0 -0
  288. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/evaluation.py +0 -0
  289. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/math_equivalence.py +0 -0
  290. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/medbench.py +0 -0
  291. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/post_process.py +0 -0
  292. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/medbench/utils.py +0 -0
  293. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/mgsm.py +0 -0
  294. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/mmlu.py +0 -0
  295. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/multirc.py +0 -0
  296. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/narrativeqa.py +0 -0
  297. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/natural_question.py +0 -0
  298. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/natural_question_cn.py +0 -0
  299. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/obqa.py +0 -0
  300. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/piqa.py +0 -0
  301. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/py150.py +0 -0
  302. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/qasper.py +0 -0
  303. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/qaspercut.py +0 -0
  304. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/race.py +0 -0
  305. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/realtoxicprompts.py +0 -0
  306. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
  307. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/reasonbench/__init__.py +0 -0
  308. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/record.py +0 -0
  309. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/rolebench.py +0 -0
  310. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/s3eval.py +0 -0
  311. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/safety.py +0 -0
  312. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/scibench.py +0 -0
  313. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/siqa.py +0 -0
  314. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/squad20.py +0 -0
  315. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/storycloze.py +0 -0
  316. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/strategyqa.py +0 -0
  317. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/corev2.py +0 -0
  318. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/creationbench.py +0 -0
  319. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/information_retrival.py +0 -0
  320. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/subjective/multiround.py +0 -0
  321. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/summedits.py +0 -0
  322. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/summscreen.py +0 -0
  323. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/svamp.py +0 -0
  324. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/tabmwp.py +0 -0
  325. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/__init__.py +0 -0
  326. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
  327. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
  328. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
  329. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
  330. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
  331. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/schema.py +0 -0
  332. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/utils/convert_results.py +0 -0
  333. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/utils/format_load.py +0 -0
  334. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/utils/meta_template.py +0 -0
  335. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/teval/utils/template.py +0 -0
  336. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/tnews.py +0 -0
  337. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/triviaqa.py +0 -0
  338. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/triviaqarc.py +0 -0
  339. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/truthfulqa.py +0 -0
  340. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/tydiqa.py +0 -0
  341. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/wic.py +0 -0
  342. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/wikibench.py +0 -0
  343. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/winograd.py +0 -0
  344. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/winogrande.py +0 -0
  345. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/wnli.py +0 -0
  346. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/wsc.py +0 -0
  347. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/xcopa.py +0 -0
  348. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/xiezhi.py +0 -0
  349. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/xlsum.py +0 -0
  350. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/datasets/xsum.py +0 -0
  351. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/metrics/__init__.py +0 -0
  352. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/metrics/dump_results.py +0 -0
  353. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/metrics/mme_score.py +0 -0
  354. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/metrics/seedbench.py +0 -0
  355. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/accessory.py +0 -0
  356. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/alaya.py +0 -0
  357. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/base.py +0 -0
  358. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/base_api.py +0 -0
  359. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/bytedance_api.py +0 -0
  360. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/claude_api/__init__.py +0 -0
  361. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/claude_api/claude_api.py +0 -0
  362. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/claude_api/postprocessors.py +0 -0
  363. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/deepseek_api.py +0 -0
  364. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/gemini_api.py +0 -0
  365. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/glm.py +0 -0
  366. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/huggingface.py +0 -0
  367. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/intern_model.py +0 -0
  368. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/krgpt_api.py +0 -0
  369. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/lagent.py +0 -0
  370. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/langchain.py +0 -0
  371. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/lightllm_api.py +0 -0
  372. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/llama2.py +0 -0
  373. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/lmdeploy_pytorch.py +0 -0
  374. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/lmdeploy_tis.py +0 -0
  375. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/mistral_api.py +0 -0
  376. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/mixtral.py +0 -0
  377. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/modelscope.py +0 -0
  378. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/moonshot_api.py +0 -0
  379. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/nanbeige_api.py +0 -0
  380. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/pangu_api.py +0 -0
  381. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/qwen_api.py +0 -0
  382. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/stepfun_api.py +0 -0
  383. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/turbomind.py +0 -0
  384. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/turbomind_api.py +0 -0
  385. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/turbomind_tis.py +0 -0
  386. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/unigpt_api.py +0 -0
  387. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/yayi_api.py +0 -0
  388. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/zhipuai_api.py +0 -0
  389. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/models/zhipuai_v2_api.py +0 -0
  390. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/__init__.py +0 -0
  391. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_dataset_reader.py +0 -0
  392. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/__init__.py +0 -0
  393. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
  394. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
  395. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
  396. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_bpc_evaluator.py +0 -0
  397. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
  398. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
  399. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +0 -0
  400. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
  401. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
  402. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
  403. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
  404. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/__init__.py +0 -0
  405. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
  406. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
  407. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +0 -0
  408. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +0 -0
  409. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
  410. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +0 -0
  411. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
  412. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +0 -0
  413. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
  414. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
  415. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_sw_ce_loss_inferencer.py +0 -0
  416. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
  417. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_prompt_template.py +0 -0
  418. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/__init__.py +0 -0
  419. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
  420. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
  421. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
  422. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
  423. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
  424. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
  425. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
  426. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
  427. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
  428. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/utils/__init__.py +0 -0
  429. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/openicl/utils/logging.py +0 -0
  430. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/__init__.py +0 -0
  431. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/base.py +0 -0
  432. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/naive.py +0 -0
  433. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/partitioners/size.py +0 -0
  434. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/registry.py +0 -0
  435. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/__init__.py +0 -0
  436. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/base.py +0 -0
  437. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/local_api.py +0 -0
  438. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/slurm.py +0 -0
  439. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/runners/slurm_sequential.py +0 -0
  440. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/__init__.py +0 -0
  441. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/circular.py +0 -0
  442. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/llm_compression.py +0 -0
  443. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/multi_faceted.py +0 -0
  444. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/multi_model.py +0 -0
  445. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/needlebench.py +0 -0
  446. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/all_obj.py +0 -0
  447. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/corev2.py +0 -0
  448. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/creationbench.py +0 -0
  449. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/flames.py +0 -0
  450. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/multiround.py +0 -0
  451. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
  452. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/summarizers/summarizer_pretrain.py +0 -0
  453. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/__init__.py +0 -0
  454. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/base.py +0 -0
  455. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/llm_eval.py +0 -0
  456. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/openicl_attack.py +0 -0
  457. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/tasks/openicl_eval.py +0 -0
  458. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/__init__.py +0 -0
  459. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/abbr.py +0 -0
  460. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/auxiliary.py +0 -0
  461. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/build.py +0 -0
  462. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/collect_env.py +0 -0
  463. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/dependency.py +0 -0
  464. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/file.py +0 -0
  465. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/fileio.py +0 -0
  466. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/lark.py +0 -0
  467. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/logging.py +0 -0
  468. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/menu.py +0 -0
  469. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass/utils/types.py +0 -0
  470. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/dependency_links.txt +0 -0
  471. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/entry_points.txt +0 -0
  472. {opencompass-0.2.5 → opencompass-0.2.6}/opencompass.egg-info/top_level.txt +0 -0
  473. {opencompass-0.2.5 → opencompass-0.2.6}/setup.cfg +0 -0
  474. {opencompass-0.2.5 → opencompass-0.2.6}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: opencompass
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: A comprehensive toolkit for large model evaluation
5
5
  Home-page: https://github.com/open-compass/opencompass
6
6
  Author: OpenCompass Contributors
@@ -78,6 +78,8 @@ Description: <div align="center">
78
78
 
79
79
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
80
80
 
81
+ - **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**. You can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
82
+ - **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
81
83
  - **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
82
84
  - **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
83
85
  - **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
@@ -158,6 +160,12 @@ Description: <div align="center">
158
160
  python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
159
161
  ```
160
162
 
163
+ Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
164
+
165
+ ```bash
166
+ python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
167
+ ```
168
+
161
169
  OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
162
170
 
163
171
  ```bash
@@ -70,6 +70,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
70
70
 
71
71
  ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
72
72
 
73
+ - **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**. You can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
74
+ - **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
73
75
  - **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
74
76
  - **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
75
77
  - **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
@@ -150,6 +152,12 @@ After ensuring that OpenCompass is installed correctly according to the above st
150
152
  python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
151
153
  ```
152
154
 
155
+ Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
156
+
157
+ ```bash
158
+ python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
159
+ ```
160
+
153
161
  OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
154
162
 
155
163
  ```bash
@@ -0,0 +1 @@
1
+ __version__ = '0.2.6'
@@ -0,0 +1,383 @@
1
+ # flake8: noqa
2
+ # yapf: disable
3
+ import argparse
4
+ import getpass
5
+ import os
6
+ import os.path as osp
7
+ from datetime import datetime
8
+
9
+ from mmengine.config import Config, DictAction
10
+
11
+ from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
12
+ from opencompass.runners import SlurmRunner
13
+ from opencompass.summarizers import DefaultSummarizer
14
+ from opencompass.utils import LarkReporter, get_logger
15
+ from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
16
+ get_config_from_arg)
17
+
18
+
19
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Registers the general options, then delegates to ``parse_slurm_args``,
    ``parse_dlc_args``, ``parse_hf_args`` and ``parse_custom_dataset_args``
    to populate their respective argument groups.

    Returns:
        argparse.Namespace: The parsed arguments. When ``--dlc`` is given,
        ``aliyun_cfg`` holds the user-expanded path.

    Raises:
        AssertionError: If ``--slurm`` is given without ``--partition``, or
            ``--dlc`` is given and the aliyun config file does not exist.
    """
    parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', nargs='?', help='Train config file path')

    # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
    # if "infer" or "eval" not specified
    launch_method = parser.add_mutually_exclusive_group()
    launch_method.add_argument('--slurm',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run with srun. '
                               'If True, `--partition(-p)` must be set. '
                               'Defaults to False')
    launch_method.add_argument('--dlc',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run on dlc. If '
                               'True, `--aliyun-cfg` must be set. Defaults'
                               ' to False')
    # Add shortcut parameters (models, datasets and summarizer)
    parser.add_argument('--models', nargs='+', help='', default=None)
    parser.add_argument('--datasets', nargs='+', help='', default=None)
    parser.add_argument('--summarizer', help='', default=None)
    # add general args
    parser.add_argument('--debug',
                        help='Debug mode, in which scheduler will run tasks '
                        'in the single process, and output will not be '
                        'redirected to files',
                        action='store_true',
                        default=False)
    parser.add_argument('--dry-run',
                        help='Dry run mode, in which the scheduler will not '
                        'actually run the tasks, but only print the commands '
                        'to run',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-a', '--accelerator',
        help='Infer accelerator, support vllm and lmdeploy now.',
        choices=['vllm', 'lmdeploy', None],
        default=None,
        type=str)
    parser.add_argument('-m',
                        '--mode',
                        help='Running mode. You can choose "infer" if you '
                        'only want the inference results, or "eval" if you '
                        'already have the results and want to evaluate them, '
                        'or "viz" if you want to visualize the results.',
                        choices=['all', 'infer', 'eval', 'viz'],
                        default='all',
                        type=str)
    parser.add_argument('-r',
                        '--reuse',
                        nargs='?',
                        type=str,
                        const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs presented in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument should '
                        'also be a specific timestamp, e.g. 20230516_144254')
    parser.add_argument('-w',
                        '--work-dir',
                        help='Work path, all the outputs will be '
                        'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc.'
                        'If not specified, the work_dir will be set to '
                        'outputs/default.',
                        default=None,
                        type=str)
    parser.add_argument(
        '--config-dir',
        default='configs',
        help='Use the custom config directory instead of config/ to '
        'search the configs for datasets, models and summarizers',
        type=str)
    parser.add_argument('-l',
                        '--lark',
                        help='Report the running status to lark bot',
                        action='store_true',
                        default=False)
    parser.add_argument('--max-num-workers',
                        help='Max number of workers to run in parallel. '
                        'Will be overrideen by the "max_num_workers" argument '
                        'in the config.',
                        type=int,
                        default=1)
    parser.add_argument('--max-workers-per-gpu',
                        help='Max task to run in parallel on one GPU. '
                        'It will only be used in the local runner.',
                        type=int,
                        default=1)
    parser.add_argument(
        '--retry',
        help='Number of retries if the job failed when using slurm or dlc. '
        'Will be overrideen by the "retry" argument in the config.',
        type=int,
        default=2)
    parser.add_argument(
        '--dump-eval-details',
        help='Whether to dump the evaluation details, including the '
        'correctness of each sample, bpb, etc.',
        action='store_true',
    )
    # set srun args
    slurm_parser = parser.add_argument_group('slurm_args')
    parse_slurm_args(slurm_parser)
    # set dlc args
    dlc_parser = parser.add_argument_group('dlc_args')
    parse_dlc_args(dlc_parser)
    # set hf args
    hf_parser = parser.add_argument_group('hf_args')
    parse_hf_args(hf_parser)
    # set custom dataset args
    custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
    parse_custom_dataset_args(custom_dataset_parser)
    args = parser.parse_args()
    if args.slurm:
        assert args.partition is not None, (
            '--partition(-p) must be set if you want to use slurm')
    if args.dlc:
        # The default value is '~/.aliyun.cfg'. os.path.exists() does not
        # expand '~', so the path has to be expanded before it is checked;
        # otherwise the default could never pass this assertion.
        args.aliyun_cfg = os.path.expanduser(args.aliyun_cfg)
        assert os.path.exists(args.aliyun_cfg), (
            'When launching tasks using dlc, it needs to be configured '
            'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
            ' to specify a new path.')
    return args
145
+
146
+
147
def parse_slurm_args(slurm_parser):
    """Register the options used for a Slurm launch on ``slurm_parser``.

    Adds ``-p/--partition``, ``-q/--quotatype`` and ``--qos``; each is an
    optional string that defaults to ``None``.
    """
    # (option strings, help text) in registration order.
    slurm_options = (
        (('-p', '--partition'), 'Slurm partition name'),
        (('-q', '--quotatype'), 'Slurm quota type'),
        (('--qos', ), 'Slurm quality of service'),
    )
    for option_names, description in slurm_options:
        slurm_parser.add_argument(*option_names,
                                  help=description,
                                  default=None,
                                  type=str)
163
+
164
+
165
def parse_dlc_args(dlc_parser):
    """Register the options used for a DLC launch on ``dlc_parser``.

    Adds ``--aliyun-cfg``, a string option whose default is the literal
    ``'~/.aliyun.cfg'`` (the ``~`` is not expanded here).
    """
    aliyun_cfg_option = dict(
        help='The config path for aliyun config',
        default='~/.aliyun.cfg',
        type=str,
    )
    dlc_parser.add_argument('--aliyun-cfg', **aliyun_cfg_option)
171
+
172
+
173
+
174
+
175
def parse_hf_args(hf_parser):
    """Register the options for quickly building a HuggingFace model.

    All options are added to ``hf_parser`` in a fixed order. The ``*-kwargs``
    options are parsed with ``DictAction`` and default to an empty dict.
    """
    add = hf_parser.add_argument

    # Model identity and loading.
    add('--hf-type',
        type=str,
        choices=['base', 'chat'],
        default='chat',
        help='The type of the HuggingFace model, base or chat')
    add('--hf-path',
        type=str,
        help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
    add('--model-kwargs',
        nargs='+',
        action=DictAction,
        default={},
        help='The kwargs for the HuggingFace model')

    # Tokenizer.
    add('--tokenizer-path',
        type=str,
        help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
    add('--tokenizer-kwargs',
        nargs='+',
        action=DictAction,
        default={},
        help='The kwargs for the tokenizer')

    # PEFT adapter.
    add('--peft-path', type=str, help='The path to the PEFT model')
    add('--peft-kwargs',
        nargs='+',
        action=DictAction,
        default={},
        help='The kwargs for the PEFT model')

    # Generation settings.
    add('--generation-kwargs',
        nargs='+',
        action=DictAction,
        default={},
        help='The kwargs for the generation')
    add('--max-seq-len',
        type=int,
        help='The max sequence length for the HuggingFace model')
    add('--max-out-len',
        type=int,
        default=256,
        help='The max output length for the HuggingFace model')
    add('--min-out-len',
        type=int,
        default=1,
        help='The min output length for the HuggingFace model')
    add('--batch-size',
        type=int,
        default=8,
        help='The batch size for the HuggingFace model')

    # Resources. `--num-gpus` is kept only so callers can detect its use.
    add('--num-gpus',
        type=int,
        default=None,
        help='Deprecated, please use --hf-num-gpus instead')
    add('--hf-num-gpus',
        type=int,
        default=1,
        help='The number of GPUs for the HuggingFace model passed via cli')

    # Token handling.
    add('--pad-token-id',
        type=int,
        help='The pad token id for the HuggingFace model')
    add('--stop-words',
        nargs='+',
        default=[],
        help='The stop words for the HuggingFace model')
193
+
194
+
195
def parse_custom_dataset_args(custom_dataset_parser):
    """Register the options for quickly building a custom dataset.

    Adds two free-form string options (dataset path and meta path) and two
    string options restricted to a fixed set of choices (data type and
    inference method). None of them sets a default.
    """
    register = custom_dataset_parser.add_argument

    for path_option in ('--custom-dataset-path',
                        '--custom-dataset-meta-path'):
        register(path_option, type=str)

    restricted_options = (
        ('--custom-dataset-data-type', ['mcq', 'qa']),
        ('--custom-dataset-infer-method', ['gen', 'ppl']),
    )
    for option_name, allowed_values in restricted_options:
        register(option_name, type=str, choices=allowed_values)
205
+
206
+
207
def main():
    """Run the evaluation pipeline described by the command-line arguments.

    Steps, in order:

    1. Parse arguments and reject the deprecated ``--num-gpus``.
    2. Build the config, resolve the timestamped work directory (optionally
       reusing a previous one), dump the config there and reload it.
    3. Optionally notify the lark bot.
    4. Depending on ``--mode``, run inference, evaluation and summarization.

    With ``--dry-run`` the function returns right after the first
    partitioner has produced its tasks, without building a runner.

    Raises:
        ValueError: If ``--num-gpus`` is given, or if the mode is ``eval`` or
            ``viz`` and ``-r/--reuse`` is not specified.
    """
    # `copy` is needed by the subjective-summarizer branch below but is not
    # imported at module level; importing it here keeps that branch from
    # failing with NameError.
    import copy

    args = parse_args()

    if args.num_gpus is not None:
        raise ValueError('The `--num-gpus` argument is deprecated, please use '
                         '`--hf-num-gpus` to describe number of gpus used for '
                         'the HuggingFace model instead.')

    if args.dry_run:
        args.debug = True
    # initialize logger
    logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')

    cfg = get_config_from_arg(args)
    if args.work_dir is not None:
        cfg['work_dir'] = args.work_dir
    else:
        cfg.setdefault('work_dir', os.path.join('outputs', 'default'))

    # cfg_time_str defaults to the current time
    cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if args.reuse:
        if args.reuse == 'latest':
            if not os.path.exists(cfg.work_dir) or not os.listdir(
                    cfg.work_dir):
                logger.warning('No previous results to reuse!')
            else:
                # Sub-directory names sort lexicographically, so the last
                # entry is taken as the most recent run.
                dirs = os.listdir(cfg.work_dir)
                dir_time_str = sorted(dirs)[-1]
        else:
            dir_time_str = args.reuse
        logger.info(f'Reusing experiements from {dir_time_str}')
    elif args.mode in ['eval', 'viz']:
        raise ValueError('You must specify -r or --reuse when running in eval '
                         'or viz mode!')

    # update "actual" work_dir
    cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
    current_workdir = cfg['work_dir']
    logger.info(f'Current exp folder: {current_workdir}')

    os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)

    # dump config
    output_config_path = osp.join(cfg.work_dir, 'configs',
                                  f'{cfg_time_str}_{os.getpid()}.py')
    cfg.dump(output_config_path)
    # Config is intentionally reloaded here to avoid initialized
    # types that cannot be serialized
    cfg = Config.fromfile(output_config_path, format_python_code=False)

    # report to lark bot if specify --lark
    if not args.lark:
        cfg['lark_bot_url'] = None
    elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()}\'s task has been launched!'
        LarkReporter(cfg['lark_bot_url']).post(content)

    if args.mode in ['all', 'infer']:
        # When user have specified --slurm or --dlc, or have not set
        # "infer" in config, we will provide a default configuration
        # for infer
        if (args.dlc or args.slurm) and cfg.get('infer', None):
            logger.warning('You have set "infer" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "infer" configuration will be overridden by '
                           'your runtime arguments.')

        if args.dlc or args.slurm or cfg.get('infer', None) is None:
            fill_infer_cfg(cfg, args)

        if args.partition is not None:
            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                cfg.infer.runner.partition = args.partition
                cfg.infer.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.infer.runner.debug = True
        if args.lark:
            cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
                                                    'predictions/')
        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.infer.runner)
        # Add extra attack config if exists
        if hasattr(cfg, 'attack'):
            for task in tasks:
                cfg.attack.dataset = task.datasets[0][0].abbr
                task.attack = cfg.attack
        runner(tasks)

    # evaluate
    if args.mode in ['all', 'eval']:
        # When user have specified --slurm or --dlc, or have not set
        # "eval" in config, we will provide a default configuration
        # for eval
        if (args.dlc or args.slurm) and cfg.get('eval', None):
            logger.warning('You have set "eval" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')

        if args.dlc or args.slurm or cfg.get('eval', None) is None:
            fill_eval_cfg(cfg, args)
        if args.dump_eval_details:
            cfg.eval.runner.task.dump_details = True

        if args.partition is not None:
            if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
                cfg.eval.runner.partition = args.partition
                cfg.eval.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.eval.runner.debug = True
        if args.lark:
            cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.eval.runner)

        # For meta-review-judge in subjective evaluation
        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
                tasks[0], list):
            # A list of task lists: run each part in turn.
            for task_part in tasks:
                runner(task_part)
        else:
            runner(tasks)

    # visualize
    if args.mode in ['all', 'eval', 'viz']:
        summarizer_cfg = cfg.get('summarizer', {})

        # For subjective summarizer
        if summarizer_cfg.get('function', None):
            main_summarizer_cfg = copy.deepcopy(summarizer_cfg)

            # Group datasets by the part of `abbr` before the first '_',
            # keeping first-seen order of the prefixes.
            grouped_datasets = {}
            for dataset in cfg.datasets:
                prefix = dataset['abbr'].split('_')[0]
                grouped_datasets.setdefault(prefix, []).append(dataset)

            # Summarize each group with the summarizer type declared by the
            # group's first dataset, and collect the non-empty scores.
            dataset_score_container = []
            for dataset_group in grouped_datasets.values():
                temp_cfg = copy.deepcopy(cfg)
                temp_cfg.datasets = dataset_group
                summarizer_cfg = dict(
                    type=dataset_group[0]['summarizer']['type'],
                    config=temp_cfg)
                summarizer = build_from_cfg(summarizer_cfg)
                dataset_score = summarizer.summarize(time_str=cfg_time_str)
                if dataset_score:
                    dataset_score_container.append(dataset_score)

            main_summarizer_cfg['config'] = cfg
            main_summarizer = build_from_cfg(main_summarizer_cfg)
            main_summarizer.summarize(
                time_str=cfg_time_str,
                subjective_scores=dataset_score_container)
        else:
            if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
                summarizer_cfg['type'] = DefaultSummarizer
            summarizer_cfg['config'] = cfg
            summarizer = build_from_cfg(summarizer_cfg)
            summarizer.summarize(time_str=cfg_time_str)
379
+
380
+
381
+
382
+ if __name__ == '__main__':
383
+ main()
@@ -0,0 +1,141 @@
1
+ # flake8: noqa
2
+ # yapf: disable
3
+
4
+ # Copyright 2023 The Google Research Authors.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import dataclasses
19
+ from typing import Dict, List, Optional, Union
20
+
21
+ from absl import flags
22
+
23
+ import opencompass.datasets.IFEval.instructions_registry as instructions_registry
24
+
25
# Command-line flags for the evaluation script. These definitions execute at
# module import time. Each flag is a string flag whose default is ``None``.

# --input_data: path to the input data; marked as required.
_INPUT_DATA = flags.DEFINE_string('input_data',
                                  None,
                                  'path to input data',
                                  required=True)

# --input_response_data: path to the input response data; optional.
_INPUT_RESPONSE_DATA = flags.DEFINE_string('input_response_data',
                                           None,
                                           'path to input response data',
                                           required=False)

# --output_dir: directory for inference and eval results; marked as required.
_OUTPUT_DIR = flags.DEFINE_string(
    'output_dir',
    None,
    'Output directory for inference and eval results.',
    required=True,
)
41
+
42
+
43
@dataclasses.dataclass
class InputExample:
    """One evaluation input: a prompt plus the instructions attached to it.

    NOTE(review): the checkers in this module read ``instruction_id_list``,
    ``prompt`` and ``kwargs`` from their ``inp`` argument, which presumably
    is an instance of this class -- confirm against the callers.
    """
    # Integer identifier of the example.
    key: int
    # Identifiers of the instructions associated with the prompt.
    instruction_id_list: List[str]
    # The prompt text.
    prompt: str
    # One keyword-argument mapping per instruction; presumably aligned by
    # index with ``instruction_id_list`` -- TODO confirm.
    kwargs: List[Dict[str, Optional[Union[str, int]]]]
49
+
50
+
51
@dataclasses.dataclass
class OutputExample:
    """Result of checking one response against its instructions.

    Both ``test_instruction_following_strict`` and
    ``test_instruction_following_loose`` in this module return an instance
    of this class.
    """
    # Identifiers of the instructions that were checked.
    instruction_id_list: List[str]
    # The prompt the response was produced for.
    prompt: str
    # The response that was evaluated.
    response: str
    # The checkers in this module set this to ``all(follow_instruction_list)``.
    follow_all_instructions: bool
    # One verdict per instruction, in the order of ``instruction_id_list``.
    follow_instruction_list: List[bool]
58
+
59
+
60
def test_instruction_following_strict(
    inp,
    response,
):
    """Check a response, as given, against every instruction of an example.

    Args:
        inp: Example object exposing ``instruction_id_list``, ``prompt`` and
            ``kwargs`` (one keyword-argument mapping per instruction).
        response: The response string to evaluate.

    Returns:
        An ``OutputExample`` carrying one boolean verdict per instruction
        and a flag telling whether every instruction was followed.
    """
    verdicts = []

    for position, inst_id in enumerate(inp.instruction_id_list):
        checker = instructions_registry.INSTRUCTION_DICT[inst_id](inst_id)
        checker.build_description(**inp.kwargs[position])

        # Instructions whose arguments include ``prompt`` are built a second
        # time with the example's prompt text.
        inst_args = checker.get_instruction_args()
        if inst_args and 'prompt' in inst_args:
            checker.build_description(prompt=inp.prompt)

        # A blank response never counts as following an instruction; the
        # checker is not even consulted in that case.
        followed = bool(response.strip()
                        and checker.check_following(response))
        verdicts.append(followed)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(verdicts),
        follow_instruction_list=verdicts,
    )
89
+
90
+
91
def test_instruction_following_loose(
    inp,
    response,
):
    """Check a response leniently against every instruction of an example.

    Several relaxed variants of the response are tried: the response itself,
    the response with its first line, last line, or both removed, and each
    of those with every ``*`` character stripped. An instruction counts as
    followed as soon as any non-blank variant passes its check.

    Args:
        inp: Example object exposing ``instruction_id_list``, ``prompt`` and
            ``kwargs`` (one keyword-argument mapping per instruction).
        response: The response string to evaluate.

    Returns:
        An ``OutputExample`` carrying one boolean verdict per instruction
        and a flag telling whether every instruction was followed. The
        ``response`` field holds the original, unmodified response.
    """
    lines = response.split('\n')
    trimmed = [
        '\n'.join(lines[1:]).strip(),    # first line dropped
        '\n'.join(lines[:-1]).strip(),   # last line dropped
        '\n'.join(lines[1:-1]).strip(),  # first and last lines dropped
    ]
    # Order matters only for which checks get executed: evaluation stops at
    # the first variant that passes.
    candidates = ([response, response.replace('*', '')] + trimmed +
                  [text.replace('*', '') for text in trimmed])

    verdicts = []

    for position, inst_id in enumerate(inp.instruction_id_list):
        checker = instructions_registry.INSTRUCTION_DICT[inst_id](inst_id)
        checker.build_description(**inp.kwargs[position])

        # Instructions whose arguments include ``prompt`` are built a second
        # time with the example's prompt text.
        inst_args = checker.get_instruction_args()
        if inst_args and 'prompt' in inst_args:
            checker.build_description(prompt=inp.prompt)

        # Blank variants are skipped without consulting the checker.
        followed = any(
            candidate.strip() and checker.check_following(candidate)
            for candidate in candidates)
        verdicts.append(followed)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(verdicts),
        follow_instruction_list=verdicts,
    )