evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope-0.8.0.dist-info/RECORD ADDED
@@ -0,0 +1,285 @@
1
+ evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
2
+ evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
3
+ evalscope/config.py,sha256=KYS_O0RdAbruQhqP6mp3rQL0003Oaskx03IroZUGRps,6897
4
+ evalscope/constants.py,sha256=D2MU7bs_qwmcHQ1ge05C5Ekk04XqMyiGxssvKwAecxI,4515
5
+ evalscope/run.py,sha256=5cG81qfdpMN_GtPphvJ7BHboD6LBYHWyodX8ViR1XL4,8874
6
+ evalscope/run_arena.py,sha256=Kmzak4TGdATbOhOCe_zLLRxDvgtkOfs6e4VaxOAzPKk,8550
7
+ evalscope/summarizer.py,sha256=Eq7ZqGKuvrhWVeGriLxHCGupgnJmtvmIGqZYzRNaY8I,6480
8
+ evalscope/version.py,sha256=Xha7v5_YH0Oppyh6iO7HrpSsmv1WCPdQPFtzYTJvG4A,118
9
+ evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ evalscope/backend/base.py,sha256=l7zUHXX2XToIfU_hkVeTSHT9wWURYumyohXCIgywZBI,1021
11
+ evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
12
+ evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
13
+ evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
14
+ evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
15
+ evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
16
+ evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
17
+ evalscope/backend/rag_eval/__init__.py,sha256=jFWj8l8bPAu1sz7wtX5gGIweBFC8c2LzXUPz7tGambE,284
18
+ evalscope/backend/rag_eval/backend_manager.py,sha256=Cw322R1j-L8vMERAWEXUTT-0a1K-V6KhQOtrOhgKVMM,2857
19
+ evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
20
+ evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
21
+ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
22
+ evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
23
+ evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
25
+ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=L0WYiy3Rgar0uMZRI-kz1qCEuUaFXwcsVj0CACG13ms,7439
26
+ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
27
+ evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
28
+ evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
29
+ evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
30
+ evalscope/backend/rag_eval/cmteb/arguments.py,sha256=Z3GkGi7zjK85JynG-7CSVPmAxPRcGYuykkgfbxgn7_E,2317
31
+ evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
32
+ evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
33
+ evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
34
+ evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-GTwORxILSkkXXGtTxuPTKSHNXQEllCRoUjuR7pnwFM,8962
35
+ evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=pbZBfjeVAKbjLy4tEk6KUVDv-Rv8HNHYWuNkfqf-Vwk,2025
36
+ evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=yISp67pXw4fSrsqTiYmfas6uPyqwE45L1c58Tpydc0E,4075
37
+ evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=AH7jwJ45WAVxVb60I2DTURVanIAbrlZzk-ey_dHWEO0,5491
38
+ evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9GUpuEEmcWwc78Q7ZJjRDZs,11454
39
+ evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
40
+ evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
41
+ evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
42
+ evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
43
+ evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
44
+ evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
45
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
46
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
47
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
48
+ evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
49
+ evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
50
+ evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
51
+ evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
52
+ evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
53
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
54
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
55
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
56
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
57
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
58
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
59
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
60
+ evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
61
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
62
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
63
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
64
+ evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
65
+ evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
66
+ evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
67
+ evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
68
+ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
69
+ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
70
+ evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ evalscope/backend/rag_eval/utils/clip.py,sha256=WZovQJGyPI33Y-9bUnanR6fIYJzrXgnjD4zVwUJSgCw,5002
72
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=XWI07YeWDALc2etP4DGluYqrid85nKz1tjM91JLZRmM,6252
73
+ evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
74
+ evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
75
+ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
76
+ evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
77
+ evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
78
+ evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
79
+ evalscope/benchmarks/benchmark.py,sha256=DnLgr__CzE4DICK3u3ZMeFY0sVktefmYh2Yql2swEhg,1796
80
+ evalscope/benchmarks/data_adapter.py,sha256=hSW-tyTXxUPS_FnsMYAxxw9e4N7jS5eLiBHgCFAQNeo,10287
81
+ evalscope/benchmarks/arc/__init__.py,sha256=9GBWGArac-s9igD8lnoEEKnpSQYNaHA8fVKonLimkrQ,360
82
+ evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
83
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=3q74hZB9G3X0-pQPzBk_a8wZIedmIlDHZBb4aUaBGRA,9197
84
+ evalscope/benchmarks/bbh/__init__.py,sha256=PcIMfTe4h5m-efBhnYQt6J-6O0qHFHGfuosRhk1Lhfo,303
85
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=UeNEEea5jqT7sYLpGGzvnxDdy6SrffM8H7gnVRpfGTw,10699
86
+ evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
87
+ evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
88
+ evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
89
+ evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt,sha256=N28JGB5_023fBzoo5HImvjz3A0zPZGoiTMPngQY8pNo,3568
90
+ evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt,sha256=pkUw0ezfvNgbYzUKCchJ6o7bg6UCVL_62LWqS6lKZY0,2405
91
+ evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt,sha256=LDRYUp6fGvG6K2l69VvDF5Szf-CUtgPqobWaQ3MHJ7A,4477
92
+ evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt,sha256=H4LkGVmx3U4F6vuqRYXKDpHOBCu7MYksLFWp1QfyDPk,4831
93
+ evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt,sha256=H8BorN-CyUrf0vrIANSgEILynJhpS02CiGjn-qad9NQ,3114
94
+ evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
95
+ evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
96
+ evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
97
+ evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt,sha256=Q4XZmrnTL1r8JCcB0mvJnb3oNUj45qjM-AfNK2ElWOQ,2121
98
+ evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt,sha256=YJ7chn5QrpNm8VHHqG2B7gYwBUt08fyT_qHhcc9KT-Y,2386
99
+ evalscope/benchmarks/bbh/cot_prompts/navigate.txt,sha256=n3Evl10cdk8VeMfZgUdu3knBH64LmLY5d4cQTnGMLuU,2147
100
+ evalscope/benchmarks/bbh/cot_prompts/object_counting.txt,sha256=SMQGqNi8JVCEVWcVVgQDedzKjslZSxHLcP68ECWX-Xc,1418
101
+ evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt,sha256=MlrdrvrBaUcW7VjWLLdN-O_yfwVFfYWHobGq099Cyhs,2386
102
+ evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt,sha256=jcL33cVyscRutNM793hWCryMBWQ-JFLip0DGM1UdAUc,2295
103
+ evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt,sha256=StrmTfLxHtvx4QM-zf1V2u8u1VQSxnZrI7Mwiizvjyw,3481
104
+ evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt,sha256=fJKB5sYhH0an4Oeqm53RDIu4mExZJVijCvGzje0nLz4,6141
105
+ evalscope/benchmarks/bbh/cot_prompts/snarks.txt,sha256=tvp4IAtaSNv8CKKeRx_G_PTVMICkenBNmMaq10SNXAE,3114
106
+ evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt,sha256=yRYmj1f0fwY8tiXTj_iiBYz5u4E4n7Sd3r0bJXHjSco,821
107
+ evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt,sha256=-qG7hItFjeahSB0EVvcikmLIR08P_fTIC-J38eV2fyk,3023
108
+ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
109
+ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
110
+ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
111
+ evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
112
+ evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
113
+ evalscope/benchmarks/ceval/__init__.py,sha256=vBN_OgmcvKglYIu96nRoT2wD8FDdM3cRoTB-dqlmbLg,393
114
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1J_WquXRPw-pRHBiYn7ZxRVSjjvWDqRUJLa8nvT1vYk,15050
115
+ evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
116
+ evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
117
+ evalscope/benchmarks/cmmlu/__init__.py,sha256=9M_Lo5-ePaD6hWG-Y-_i-U79yTOKadtHPG7zFvekwN4,393
118
+ evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
119
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=malBAKNtDbfJ-kJoQUQTYYQ18MTJST63bgcsLiiktlw,13956
120
+ evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
121
+ evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0rCSiHe-yTY0nwM6-l75nI,465
122
+ evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
123
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
124
+ evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
125
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=Y7_d6hmh94W2XbzUnDMX9_uKWcarK0zv4Q4mQWUfSZ8,5869
126
+ evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
127
+ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
128
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=gg65W_pz4mPOBUOwaYIgfUxGKzrmRZRuoEg5xtS8bYg,13830
129
+ evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
130
+ evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
131
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=7REJeC8vD8OVtmcqI5TP6cTn88-KOzBs5oOKEZEmESs,8459
132
+ evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
133
+ evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
134
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=HxAjkIA-Wt5-wb8kNSDMzZRoHflgsNxIfa1BoeVzwog,1660
135
+ evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
136
+ evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
137
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
138
+ evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
139
+ evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
140
+ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
141
+ evalscope/benchmarks/race/race_adapter.py,sha256=Ppo7bttx15zB-m-UtguIwIXgqpEKAi_ClIOol0hPQiE,9805
142
+ evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
143
+ evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
144
+ evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
145
+ evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
146
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=YFatCVNM7I0YUttBznQMohmMkm3qxJpCSVxf6o_sgHk,7663
147
+ evalscope/benchmarks/truthful_qa/__init__.py,sha256=EZOaHn13NS3ddHpS62ija8jz71SxOOsqcQRVg69e_Ho,429
148
+ evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
149
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=XFnZXQJpHEe_xP_HImPHa8qrwojywnWAgeSaJAYB0oU,14916
150
+ evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
151
+ evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
152
+ evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
153
+ evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
154
+ evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
155
+ evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
156
+ evalscope/evaluator/__init__.py,sha256=h_EyZm7vDqBsGx6CkoQVLg0aMy0tE_IG5uEnheubb0s,174
157
+ evalscope/evaluator/evaluator.py,sha256=MGkuJi9o5Hdbj_fN7qolDqP0B47i9i0ksGd1uc-TMn0,18365
158
+ evalscope/evaluator/humaneval_evaluator.py,sha256=245XRxwulGQpjdapwU8CiYJn1xT0XKxl7hdWvzFxLG0,5964
159
+ evalscope/evaluator/rating_eval.py,sha256=VuDIZcmSlsv1tc8znDGesz8ZwpQ7NvZJPv823Quvht0,5566
160
+ evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
161
+ evalscope/evaluator/reviewer/auto_reviewer.py,sha256=YVTJAHK0uz9hNupsdeTXMM2PISECf8phXq0GYPr4law,16378
162
+ evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
163
+ evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
164
+ evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
165
+ evalscope/metrics/metrics.py,sha256=9Qj2KuSmaLOPhpGdBfiKGKVTIxHCuk0CPKI2b6L1zb8,12589
166
+ evalscope/metrics/rouge_metric.py,sha256=oB-rBgMnavZSyOiAefg--OXdGfffKrET5bUmrx3nmx0,4408
167
+ evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
168
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=9YdE38duhBFsmFLkY7HXDCQqUNavB5Hh3kaB4WTjAII,11971
169
+ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
170
+ evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
171
+ evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
172
+ evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
173
+ evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
174
+ evalscope/models/model_adapter.py,sha256=XBeSFTR9pXmnhFWRRddcobnITC5T4JKooeFUeWEtUVI,19006
175
+ evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
176
+ evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
177
+ evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
178
+ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
179
+ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
180
+ evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
+ evalscope/perf/arguments.py,sha256=_gW1tq7SbrAZd05N-FbY_oWrQB0Djs4KUaFdXSfFsr8,9112
182
+ evalscope/perf/benchmark.py,sha256=ff9PFFMY5UucuUihcdo6lSf1X9XXoaOmrpBvjDk5Mrw,9599
183
+ evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
184
+ evalscope/perf/main.py,sha256=-8NsvJZ7uyVfJT9N2lX36KfsHkVTy0r8OcsWPYoKms0,1316
185
+ evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
186
+ evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
187
+ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
188
+ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
189
+ evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
190
+ evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
191
+ evalscope/perf/plugin/api/openai_api.py,sha256=KRN6EjObTG08mcI82kJD3dGK7DoVMUZzrUZ1AgoLEp0,7007
192
+ evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
193
+ evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
194
+ evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
195
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=CGYtmRw71-ycJIObAHm2gmmJl_1MXPJOwmHV-0WS8DY,1581
196
+ evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
197
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=Yx5nxHGkmD4lJOJ-jcyqm2ZsGAxotJc77jUCkO1z0a4,1164
198
+ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
199
+ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
200
+ evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
+ evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
202
+ evalscope/perf/utils/benchmark_util.py,sha256=xFZSSUoBoFpHRZC69-KS9cK2vqJlL7rIuCEz_MnpnGA,5564
203
+ evalscope/perf/utils/db_util.py,sha256=A2K3otCrNw3K1SMwoYo8a6jekT5nAVvWJepqi31DH28,7479
204
+ evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
205
+ evalscope/perf/utils/local_server.py,sha256=31EQZ8S_SzgSiBFpc9zRU13GXm2jREvRmPDN5qWKgbg,4468
206
+ evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
207
+ evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
208
+ evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
209
+ evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
210
+ evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
211
+ evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
212
+ evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
213
+ evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
214
+ evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
215
+ evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
216
+ evalscope/registry/tasks/arc.yaml,sha256=MghUuCmZPEwGqwYhA8ClRWHiSwC3kbHcKMRicQl9aqc,765
217
+ evalscope/registry/tasks/bbh.yaml,sha256=GE3PpE8zw_SROj41LZ5bTm6ZXXZjYOorAdwBCTEePXM,604
218
+ evalscope/registry/tasks/bbh_mini.yaml,sha256=8o9ZiWaCTkN2uTwiOhjBQuyKm7GUw6ZfUZxb2bkOmvs,678
219
+ evalscope/registry/tasks/ceval.yaml,sha256=XDaszb7DROKk8nQDiklirTvDJwkOUJtIN_tcUFVvIJk,703
220
+ evalscope/registry/tasks/ceval_mini.yaml,sha256=4aYW4c0IzgAXSs5dp4d8dJ0OHVp5sD4uiRjChjL1zZg,672
221
+ evalscope/registry/tasks/cmmlu.yaml,sha256=yOgKl1jmfcAfTuUcIMmG5SQhkrbEHEyyP3YuCuIN3l0,703
222
+ evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=egdiM5oG7RSs0M-g8QNikwhJ9tZVgw5FiLy-rIYYHAA,737
223
+ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNiAhGEdUqL-8c,702
224
+ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
225
+ evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
226
+ evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
227
+ evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
228
+ evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
229
+ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
230
+ evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
231
+ evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
232
+ evalscope/third_party/longbench_write/eval.py,sha256=bZrpaKg9sPXv2VkUxLpfJiNqMIoIj7Pf3eFMqmDncyY,11229
233
+ evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
234
+ evalscope/third_party/longbench_write/longbench_write.py,sha256=1caNiJvmZL2vwDU6oHUE4cdCViZGYE8yBo9EsMcA-Qw,3955
235
+ evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
236
+ evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
237
+ evalscope/third_party/longbench_write/resources/judge.txt,sha256=Go1ISY4bUBmEDXXY_DItjAmskuHSaRj5WTNMNH98FSk,1885
238
+ evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26ZSXzCTWWJTWXgFAYvOYupRuvdJUt_izOeSNOrV3k,54155
239
+ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
240
+ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
241
+ evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
242
+ evalscope/third_party/longbench_write/tools/data_etl.py,sha256=nmWKOrD-GeZi0ZGH5jLCGuW3qiLTui8ASSxI2z8l6ls,5962
243
+ evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
244
+ evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
245
+ evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
246
+ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2eexlehSi9LI4F3EPk-3JacrAb6ZoyxI,451
247
+ evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
248
+ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
249
+ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
250
+ evalscope/third_party/toolbench_static/toolbench_static.py,sha256=y4nC9WCBCgBg378aWYAdhmrFte_r_XOkigJs7XJ_iXQ,1930
251
+ evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
252
+ evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
253
+ evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
254
+ evalscope/tools/combine_reports.py,sha256=1BJ29IEUKoZLM3NAzg_IpU8B9uhljO9-b_hqAYi9RpA,5078
255
+ evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
256
+ evalscope/tools/rewrite_eval_results.py,sha256=2lbDHfF_9abK1tUk2UYZZRwzO68eoiE36dXyh_b-mwg,2011
257
+ evalscope/utils/__init__.py,sha256=hDS1xpoAxtVH4-ZQOXstdg7WYmjcGPQ62Kh54FIgkwU,87
258
+ evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
259
+ evalscope/utils/chat_service.py,sha256=N8lJPiVtzdqsHypa42wzb15T7hduXUrRPtU3Atf8yg4,8641
260
+ evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
261
+ evalscope/utils/logger.py,sha256=IkY0oxkWSvfA0z1m79crioTiqQcnxulNF5HtJNlV0Fc,3174
262
+ evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
263
+ evalscope/utils/utils.py,sha256=PVtpv3WAIm6Bs66Vz4KBDiAiXp8y6Oejxxr1LWHTRsI,15146
264
+ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
265
+ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
266
+ tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
267
+ tests/cli/test_run.py,sha256=lXR35DDLQjdb-XGA6pKnQC9pJTfTOHjknAN7PEaw8G4,4334
268
+ tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
269
+ tests/perf/test_perf.py,sha256=GD5nInXpQG7H1E8wI6dvy4DFSvTEddGDzv-Cu8YV1ts,2995
270
+ tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
+ tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
272
+ tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
273
+ tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
274
+ tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
275
+ tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
276
+ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
277
+ tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
278
+ tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
279
+ tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
280
+ evalscope-0.8.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
281
+ evalscope-0.8.0.dist-info/METADATA,sha256=5RKZaNBwuJj84sdAXlNmT11Bm8kGYha6EYnqszwZ1Qk,23190
282
+ evalscope-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
283
+ evalscope-0.8.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
284
+ evalscope-0.8.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
285
+ evalscope-0.8.0.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -1,17 +1,22 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
+ import os
  import subprocess
+ import torch
  import unittest
- from evalscope.utils import test_level_list, is_module_installed
+
+ from evalscope.run import run_task
+ from evalscope.utils import is_module_installed, test_level_list
  from evalscope.utils.logger import get_logger

+ os.environ['LOG_LEVEL'] = 'DEBUG'
+
  logger = get_logger()


  class TestRun(unittest.TestCase):

  def setUp(self) -> None:
- logger.info(f'Init env for evalscope native run UTs ...\n')
+ logger.info('Init env for evalscope native run UTs ...\n')
  self._check_env('evalscope')

  def tearDown(self) -> None:
@@ -26,14 +31,12 @@ class TestRun(unittest.TestCase):

  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_simple_eval(self):
- model = 'ZhipuAI/chatglm3-6b'
- template_type = 'chatglm3'
+ model = 'qwen/Qwen2-0.5B-Instruct'
  datasets = 'arc' # arc ceval
- limit = 100
+ limit = 10

- cmd_simple = f'python3 -m evalscope.run ' \
+ cmd_simple = f'evalscope eval ' \
  f'--model {model} ' \
- f'--template-type {template_type} ' \
  f'--datasets {datasets} ' \
  f'--limit {limit}'

@@ -46,15 +49,13 @@ class TestRun(unittest.TestCase):

  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_eval_with_args(self):
- model = 'ZhipuAI/chatglm3-6b'
- template_type = 'chatglm3'
- datasets = 'arc ceval' # arc ceval
+ model = 'qwen/Qwen2-0.5B-Instruct'
+ datasets = 'arc' # arc ceval
  limit = 5
  dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'

- cmd_with_args = f'python3 -m evalscope.run ' \
+ cmd_with_args = f'evalscope eval ' \
  f'--model {model} ' \
- f'--template-type {template_type} ' \
  f'--datasets {datasets} ' \
  f'--limit {limit} ' \
  f'--generation-config do_sample=false,temperature=0.0 ' \
@@ -68,9 +69,47 @@ class TestRun(unittest.TestCase):
  logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')

  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
- def test_run_eval_local(self):
- ...
+ def test_run_task(self):
+ task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k'], 'limit': 2, 'debug': False}
+ run_task(task_cfg=task_cfg)


+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_custom_task(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen/Qwen2-0.5B-Instruct',
+ datasets=['ceval'], # data format; multiple-choice datasets use the fixed 'ceval' format
+ dataset_args={
+ 'ceval': {
+ 'local_path': 'custom_eval/text/mcq', # path to the custom dataset
+ 'subset_list': [
+ 'example' # evaluation dataset name, the * in the *_dev.csv files above
+ ]
+ }
+ },
+ )
+ run_task(task_cfg=task_cfg)
+
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+ def test_run_custom_qa(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen/Qwen2-0.5B-Instruct',
+ datasets=['general_qa'], # data format; multiple-choice datasets use the fixed 'ceval' format
+ dataset_args={
+ 'general_qa': {
+ 'local_path': 'custom_eval/text/qa', # path to the custom dataset
+ 'subset_list': [
+ 'example' # evaluation dataset name, the * in the *_dev.csv files above
+ ]
+ }
+ },
+ )
+
+ run_task(task_cfg=task_cfg)
+
  if __name__ == '__main__':
  unittest.main()
tests/perf/test_perf.py CHANGED
@@ -1,5 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import os
+
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
  import unittest

  from evalscope.perf.main import run_perf_benchmark
@@ -49,6 +51,8 @@ class TestPerf(unittest.TestCase):
  'model': 'qwen2.5',
  'api': 'openai',
  'dataset': 'speed_benchmark',
+ 'min_tokens': 2048,
+ 'max_tokens': 2048,
  'debug': True,
  }
  run_perf_benchmark(task_cfg)
tests/rag/test_clip_benchmark.py CHANGED
@@ -1,13 +1,13 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  import os
-
  # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
  import subprocess
  import unittest
- from evalscope.utils import test_level_list, is_module_installed
- from evalscope.utils.logger import get_logger
+
  from evalscope.run import run_task
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger

  logger = get_logger()

@@ -15,7 +15,7 @@ logger = get_logger()
  class TestCLIPBenchmark(unittest.TestCase):

  def setUp(self) -> None:
- self._check_env("webdataset")
+ self._check_env('webdataset')
  def tearDown(self) -> None:
  pass

@@ -23,57 +23,57 @@ class TestCLIPBenchmark(unittest.TestCase):
  @staticmethod
  def _check_env(module_name: str):
  if is_module_installed(module_name):
- logger.info(f"{module_name} is installed.")
+ logger.info(f'{module_name} is installed.')
  else:
- raise ModuleNotFoundError(f"run: pip install {module_name}")
+ raise ModuleNotFoundError(f'run: pip install {module_name}')

- @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_task(self):
  task_cfg = {
- "eval_backend": "RAGEval",
- "eval_config": {
- "tool": "clip_benchmark",
- "eval": {
- "models": [
+ 'eval_backend': 'RAGEval',
+ 'eval_config': {
+ 'tool': 'clip_benchmark',
+ 'eval': {
+ 'models': [
  {
- "model_name": "AI-ModelScope/chinese-clip-vit-large-patch14-336px",
+ 'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
  }
  ],
- "dataset_name": ["muge", "mnist"],
- "split": "test",
- "batch_size": 128,
- "num_workers": 1,
- "verbose": True,
- "skip_existing": False,
- "output_dir": "outputs",
- "cache_dir": "cache",
- "limit": 1000,
+ 'dataset_name': ['muge', 'mnist'],
+ 'split': 'test',
+ 'batch_size': 128,
+ 'num_workers': 1,
+ 'verbose': True,
+ 'skip_existing': False,
+ 'output_dir': 'outputs',
+ 'cache_dir': 'cache',
+ 'limit': 1000,
  },
  },
  }

  run_task(task_cfg)

- @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_custom(self):
  task_cfg = {
- "eval_backend": "RAGEval",
- "eval_config": {
- "tool": "clip_benchmark",
- "eval": {
- "models": [
+ 'eval_backend': 'RAGEval',
+ 'eval_config': {
+ 'tool': 'clip_benchmark',
+ 'eval': {
+ 'models': [
  {
- "model_name": "AI-ModelScope/chinese-clip-vit-large-patch14-336px",
+ 'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
  }
  ],
- "dataset_name": ["custom"],
- "data_dir": "custom_eval/multimodal/text-image-retrieval",
- "split": "test",
- "batch_size": 128,
- "num_workers": 1,
- "verbose": True,
- "skip_existing": False,
- "limit": 1000,
+ 'dataset_name': ['custom'],
+ 'data_dir': 'custom_eval/multimodal/text-image-retrieval',
+ 'split': 'test',
+ 'batch_size': 128,
+ 'num_workers': 1,
+ 'verbose': True,
+ 'skip_existing': False,
+ 'limit': 1000,
  },
  },
  }
@@ -81,5 +81,5 @@ class TestCLIPBenchmark(unittest.TestCase):
  run_task(task_cfg)


- if __name__ == "__main__":
+ if __name__ == '__main__':
  unittest.main(buffer=False)
tests/rag/test_mteb.py CHANGED
@@ -2,9 +2,10 @@

  import subprocess
  import unittest
- from evalscope.utils import test_level_list, is_module_installed
- from evalscope.utils.logger import get_logger
+
  from evalscope.run import run_task
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger

  logger = get_logger()

tests/rag/test_ragas.py CHANGED
@@ -1,9 +1,10 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import os
  import unittest
- from evalscope.utils import test_level_list, is_module_installed
- from evalscope.utils.logger import get_logger
+
  from evalscope.run import run_task
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger

  logger = get_logger()

@@ -40,7 +41,6 @@ class TestRAGAS(unittest.TestCase):
  },
  'generator_llm': {
  'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
- 'template_type': 'qwen',
  },
  'embeddings': {
  'model_name_or_path': 'AI-ModelScope/m3e-base',
@@ -64,7 +64,6 @@ class TestRAGAS(unittest.TestCase):
  'testset_file': 'outputs/testset_chinese_with_answer.json',
  'critic_llm': {
  'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
- 'template_type': 'qwen',
  },
  'embeddings': {
  'model_name_or_path': 'AI-ModelScope/m3e-base',
@@ -90,7 +89,8 @@ class TestRAGAS(unittest.TestCase):
  'eval_config': {
  'tool': 'RAGAS',
  'eval': {
- 'testset_file': 'outputs/testset.json',
+ 'testset_file':
+ 'outputs/testset.json',
  'critic_llm': {
  'model_name': 'gpt-4o-mini', # name of the custom chat model
  'api_base': 'http://127.0.0.1:8088/v1', # custom base URL
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import json
  import os
+ import requests
  import subprocess
  import time
  import unittest

- import json
- import requests
-
  from evalscope.backend.opencompass import OpenCompassBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer