evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  23. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  24. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  25. evalscope/backend/rag_eval/utils/clip.py +149 -0
  26. evalscope/backend/rag_eval/utils/embedding.py +183 -0
  27. evalscope/backend/rag_eval/utils/llm.py +72 -0
  28. evalscope/backend/rag_eval/utils/tools.py +63 -0
  29. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  30. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  31. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  32. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  33. evalscope/benchmarks/race/samples.jsonl +5 -0
  34. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  35. evalscope/cli/start_perf.py +8 -11
  36. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  37. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  38. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  39. evalscope/metrics/rouge_metric.py +30 -15
  40. evalscope/perf/arguments.py +179 -0
  41. evalscope/perf/benchmark.py +245 -0
  42. evalscope/perf/http_client.py +127 -711
  43. evalscope/perf/main.py +35 -0
  44. evalscope/perf/plugin/__init__.py +2 -0
  45. evalscope/perf/plugin/api/__init__.py +3 -0
  46. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  47. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  48. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  49. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  50. evalscope/perf/plugin/datasets/__init__.py +6 -0
  51. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  52. evalscope/perf/plugin/datasets/custom.py +21 -0
  53. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  54. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  55. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  56. evalscope/perf/plugin/datasets/openqa.py +38 -0
  57. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  58. evalscope/perf/plugin/registry.py +54 -0
  59. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  60. evalscope/perf/utils/benchmark_util.py +135 -0
  61. evalscope/perf/utils/chat_service.py +252 -0
  62. evalscope/perf/utils/db_util.py +200 -0
  63. evalscope/perf/utils/handler.py +46 -0
  64. evalscope/perf/utils/local_server.py +139 -0
  65. evalscope/registry/config/cfg_arena.yaml +77 -0
  66. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  67. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  68. evalscope/registry/config/cfg_single.yaml +78 -0
  69. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  70. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  71. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  72. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  73. evalscope/registry/data/question.jsonl +80 -0
  74. evalscope/third_party/longbench_write/README.md +118 -0
  75. evalscope/third_party/longbench_write/default_task.json +27 -0
  76. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  77. evalscope/third_party/toolbench_static/README.md +118 -0
  78. evalscope/third_party/toolbench_static/config_default.json +15 -0
  79. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  80. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  81. evalscope/utils/logger.py +18 -20
  82. evalscope/utils/utils.py +41 -42
  83. evalscope/version.py +2 -2
  84. evalscope-0.7.0.dist-info/LICENSE +203 -0
  85. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
  86. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
  87. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  88. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  89. tests/cli/__init__.py +1 -0
  90. tests/cli/test_run.py +76 -0
  91. tests/perf/__init__.py +1 -0
  92. tests/perf/test_perf.py +96 -0
  93. tests/rag/__init__.py +0 -0
  94. tests/rag/test_clip_benchmark.py +85 -0
  95. tests/rag/test_mteb.py +136 -0
  96. tests/rag/test_ragas.py +120 -0
  97. tests/swift/__init__.py +1 -0
  98. tests/swift/test_run_swift_eval.py +146 -0
  99. tests/swift/test_run_swift_vlm_eval.py +128 -0
  100. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  101. tests/test_run_all.py +12 -0
  102. tests/vlm/__init__.py +1 -0
  103. tests/vlm/test_vlmeval.py +59 -0
  104. evalscope/perf/_logging.py +0 -32
  105. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  106. evalscope/perf/datasets/openqa.py +0 -22
  107. evalscope/perf/plugin_registry.py +0 -35
  108. evalscope/perf/query_parameters.py +0 -42
  109. evalscope/perf/server_sent_event.py +0 -43
  110. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  111. /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
  112. /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
  113. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  114. {evalscope/preprocess → tests}/__init__.py +0 -0
@@ -6,7 +6,7 @@ evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
6
6
  evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
7
7
  evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
8
8
  evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
9
- evalscope/version.py,sha256=HbExGw191bJuKShYz5RiaxbmdfvIqJQ_bjIjXZhfMDw,121
9
+ evalscope/version.py,sha256=KtYzabHQSFovvIs99J7glrRj7yrdxYKzKpbzzM2lORk,118
10
10
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
12
12
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,7 +14,7 @@ evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG
14
14
  evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
15
15
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
16
16
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
17
- evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=t2t3_dHZf-eMfNqpQaD2XIjWZejTN4AxVXITdj_4Y3o,5324
17
+ evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=3V67A2LSj_XaiGd9fqdKpxpzyNrfynCH3UnhaBtAaqc,5326
18
18
  evalscope/backend/rag_eval/__init__.py,sha256=8om6TVnTMmyTEQt1jBuUQA4UfIzyps-_-ih90H_Qjio,284
19
19
  evalscope/backend/rag_eval/backend_manager.py,sha256=jmO-UMu6_iOXMnl4--PrMWCsnIYEhsbiX017rtURqm0,2997
20
20
  evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=gDXCiRUTSeGQHxd5SjQsnphMqHJ2si2jywRiHvujEOg,150
@@ -25,6 +25,8 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-
25
25
  evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=Bj2ysvM0JT-6T40v0rffeZgJIRht5KVX0GzMOiUphf0,2578
26
26
  evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=ZrUYDbQ75eo0vmIwXh5Bb9c4nyEwd4AO2oURaIqjIII,7502
27
27
  evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=Bcs64xece4BMNhxuaFimOwMJnlpjNxfGrdSCWOYItko,5977
28
+ evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=3wW-AigMx5rygsI47rr8Kym_t0GWO4eio7zSAavSr6A,8765
29
+ evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
28
30
  evalscope/backend/rag_eval/cmteb/__init__.py,sha256=ajVz6XP5hqPq-jm66hp2poA2qKj1V19ZGoqjrGUlO7U,279
29
31
  evalscope/backend/rag_eval/cmteb/arguments.py,sha256=wZvnVir2tSxYCV_DPR3TSDj4VxtUn3wLhBPqyMJYKno,2330
30
32
  evalscope/backend/rag_eval/cmteb/base.py,sha256=fYrIjKwOLwBAHb2rlNkEjYScjZ5Qpyv2LdMmWZYWREA,2830
@@ -43,11 +45,36 @@ evalscope/backend/rag_eval/ragas/task_template.py,sha256=nv2i9-NE2SXpLrVKo5zhadY
43
45
  evalscope/backend/rag_eval/ragas/metrics/__init__.py,sha256=HgY5nrcNtWpQ7gBi5lCEJXJVINd_R57dsmI8ldS2rd0,160
44
46
  evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py,sha256=Uqz5qWZ76Gos95_QlhwncbATXyk0YX4wkI0LiAdPElU,3838
45
47
  evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py,sha256=CdLnWHq1eTna6j3F5-pncW5YusxD_v3ScjzeCsZ7mng,3967
48
+ evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=1m8FBVga_uetCkahL_mwhGS8nAXG8V4jmnT4iP_6QYo,794
49
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=YaqCbIynnRtPQHng6AzlD4l7KA-TPAi4ayjnhZj6gw0,3940
50
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=-BjIwLy3QOiQbFGqjhYTNfhLTLeaBeOtpKBKfpjlf7E,1736
51
+ evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=eyUasvFvtwXAcpeUaOOBVuvxhGl-u_dndV-qsjnqsF4,981
52
+ evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=KXr3hmd49n1KsgYWrjTuYY9xBFIcTSksueVTUEwfEm0,3188
53
+ evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=1A9KlwbQr8WqNxdLEa4nU1HlPzF-q2KflQ591pJA0To,2475
54
+ evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=YQFk8o0esRyOF9m2aJBR_Nwn40D6LAr7YrfhQdHae_s,1739
55
+ evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=xH4kduv1OUJIl_xcGGh-StK_zOlZa4G-pCrIt1M8Hbc,1025
56
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=Q4sf2Xud4NpVrbEIYZJEE_VVjMy-fgwX_AK0OnMQpDg,992
57
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=FGGqRlNgvEXnH-YcNPk5pzoRZXwtaS5cMtbIBQyEPyU,669
58
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=4JTUCczH-7UjH5nlz13w-srcTC3usqiXjJwLwxu-MIg,919
59
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=PJ2IHm3zXHe_XnT_DPxL5TNqJGJ-jjX2owVShw9V9kA,672
60
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=4JTUCczH-7UjH5nlz13w-srcTC3usqiXjJwLwxu-MIg,919
61
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=nZ7VIz6R1XyyKtP0Vq5jPFNfHaN6M1Z9rFPOCVRChBE,1374
62
+ evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=5IKDA_hPmyuDXMhzK7aACrZGrYNT3wuqhzsHYC7Vkt4,1496
63
+ evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=uY_4P9OloNHP2IdvIuoTFCuUEHMyEqx9TzCoC6tj8G8,774
64
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=-0BwNQgPeH3dIIHsgNSL9OCMsg03oqtWtqm6HJG6gOk,663
65
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=9oSmEYvqor920jXByeNynyOSXagAukFK_e4jnMuDZQU,916
66
+ evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=ukF4AaOn8Su0uZ5E_uszzZFC1_MY2M9OymOSZ15w0BQ,688
67
+ evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=dH-etTJrQ0gQIS97QCZ5IhQR223gLS0_QZjUEW91fOA,657
46
68
  evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=WO2xja0g0JSiYGdu2uAEDQgDceuFcgPWwPoqFnwDU0s,172
47
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=In-2VvZJIZvXl9idGUUQBTb7Gu-o1yFLjaqj-eJkWw0,8437
69
+ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=nX-dG0Fm1629pSASujuEmMODFZf1955WncNNykRrNtI,9305
48
70
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=bXOqik6qKWzbrEz21ykdkqeqqPrmoUIhTwW6eRQXy0M,2222
71
+ evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
+ evalscope/backend/rag_eval/utils/clip.py,sha256=frafvJ1soUtjFUmi-053_Fhg6ERRwyvczQBlLWAX9vE,5104
73
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=RZf0JlovZY_cCBsq8MMUqC_Sy78WtKLY_rBAlRA_udo,6239
74
+ evalscope/backend/rag_eval/utils/llm.py,sha256=9tFwMNoTf3jNomgDu5qqVLO92HtEtelH3DXpny9_B2g,2552
75
+ evalscope/backend/rag_eval/utils/tools.py,sha256=LpcYoeIBj1btzQ1_P84u1dYCdRWhMtiltxihmZCvWKk,1528
49
76
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
50
- evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ewhpE9yzsqf5ED6kqsqek2YEgg96GBQOupxtVNhaXxI,6046
77
+ evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
51
78
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
52
79
  evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
53
80
  evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
@@ -87,9 +114,11 @@ evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=uhRRz8y0hfHI96olJS9
87
114
  evalscope/benchmarks/ceval/__init__.py,sha256=SatTco8Ks6wD0jh9LUN5chf21VaJnwW1SG4cGG8OYAo,343
88
115
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=FBUTdmW4a5TY7atBjE_H1h_ST2_WoPWMMTvfHNvusNU,15852
89
116
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=S32eMfGUBMrUDP39HzO6XfvSir0tthHCPItNtriE-hc,5063
117
+ evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
90
118
  evalscope/benchmarks/cmmlu/__init__.py,sha256=mIMlXA_BHb_bF71Oi5XJwhV_sZKN2b_lBTOXhU5h6Bg,342
91
119
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=q_6ONrjdcHNqpXTUmSVbNOfl1yMd0zEQZWnh0PMQmYY,5153
92
120
  evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=jqVghYwex2Awx7THgka0wQ7dFY0EdzfnI7n0aMXGPro,15216
121
+ evalscope/benchmarks/cmmlu/samples.jsonl,sha256=l842nKaAfeRE69jcX_E5N1gstWrHYpoNZjP-5D6Aq_k,1721
93
122
  evalscope/benchmarks/competition_math/__init__.py,sha256=hXO0DTtrA_0YDYUcyrL4XOyPGvPEa0sy2miHTF1Cxrg,393
94
123
  evalscope/benchmarks/competition_math/competition_math.py,sha256=0p5iKUfU6WpXgplb44YgVWZUYkeWLLmOdj66_dapdDc,2678
95
124
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=FijGL1FlEWJAy34tp3bIapiglT7KBJ8AvU8bjP4CGAw,19087
@@ -107,10 +136,13 @@ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=1YiAvNbWRUcaTu9oGwpDM
107
136
  evalscope/benchmarks/mmlu/__init__.py,sha256=fZicGcLq67XOc5cofGCi6WrV4FdubLupKb7nMdCUQSA,337
108
137
  evalscope/benchmarks/mmlu/mmlu.py,sha256=GhjZFOgX5qG041eVrSWggOcRcMyl0oAI_yGXmufwEzc,5256
109
138
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=9lg_3s3QjGKC794O-RogU9cdvcCP7_Vp4ve9U9dRhz8,16401
139
+ evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
110
140
  evalscope/benchmarks/race/__init__.py,sha256=htMZhgk40CsvNF7HXaHeAejUnGbUtU6Nu2yATOiMfaU,337
111
141
  evalscope/benchmarks/race/race.py,sha256=giY44Vr6CePdVQxpi0x4CLsaknye47Gdlc_PVqN9VCA,3835
112
142
  evalscope/benchmarks/race/race_adapter.py,sha256=3zHfz3tFzCVKoYLtzpGek338ZnIGT7ejq_xSaMxiIjU,9900
143
+ evalscope/benchmarks/race/samples.jsonl,sha256=GMwF5IPRWrsq6cfYNGS5yt_woXz687HObA0IkB6k3V4,1242
113
144
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=oslov-n_oV3bhEhrPXLJoQwmHE8_vYR2JTerxoHq29A,351
145
+ evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=9OsKC9uuBbw9MHghOOMMALeGkFOY_QTNWZYAr0ASPQ0,3444
114
146
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=xrebA71r_Ek9NvwkDfsmWTuRCsae2HZEGmTBtZMGwfM,3296
115
147
  evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=0g0xSWorXiHrZ3PKTqOO6g18kK2tUop1HWaAjmCKRwg,7659
116
148
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=4bRdnHOceaEvn20jZj0yLCg5wpOHpzP3LRjkYm5u-Fs,367
@@ -119,7 +151,7 @@ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=Cavimjnc6NPMC1TDO
119
151
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
120
152
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
121
153
  evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
122
- evalscope/cli/start_perf.py,sha256=TL6bMXYl3ln-tfs5uBmzb9x94uxz6f3PBFIt1l7g3VA,994
154
+ evalscope/cli/start_perf.py,sha256=yIE3sP13_yoTXQD3DBNzRVY6L_5p-Ix0J1VBvZFYdVU,914
123
155
  evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
124
156
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
125
157
  evalscope/evaluator/evaluator.py,sha256=eSCgPPDGfIJfKu0cthhbDLFm1xMhj_869iT3ngcQkPc,30817
@@ -130,9 +162,11 @@ evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw
130
162
  evalscope/metrics/code_metric.py,sha256=zK1tpNDZbvmSHt3a_JJ5Y2Hdu2cqeFriy__wUOl2tSw,3462
131
163
  evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lXomdWw,1988
132
164
  evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
133
- evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
165
+ evalscope/metrics/rouge_metric.py,sha256=VNdy86ZGZL6thVDFg0nKedp6dPApV7_yoIupMe0f6hk,4518
134
166
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
135
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=vhzIMSQezhZuJzGndymWjB_iRbDdECoEidOIdNL3NAM,12213
167
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=MXcHwmsXnh9mQZR1Bt5St6DNwXY-mfz4dNM8y6a23dc,12236
168
+ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
169
+ evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
136
170
  evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
137
171
  evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
138
172
  evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
@@ -144,25 +178,42 @@ evalscope/models/api/openai_api.py,sha256=uBicJPaFLOhIrB5PKI8FE-SItb7v-fuDwBgkgn
144
178
  evalscope/models/custom/__init__.py,sha256=K4Ewo7Qrs73-jBuPq4ffxd8hMnttKhic-Zj0amH3wiU,103
145
179
  evalscope/models/custom/custom_model.py,sha256=2ivxfGQs5V5HDnQEhTBi5v8KNBxJDbzPVJdNOGo3iSg,1566
146
180
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
147
- evalscope/perf/_logging.py,sha256=v-a1uhqUt8116OEGXa-uhCPNE3mLxBaJZaKo2ReJgp8,1034
148
- evalscope/perf/api_plugin_base.py,sha256=ieAE-WjJLfgKIz0lDv1TkoKU3oPAW4pMseOJLmuHxCo,2243
149
- evalscope/perf/custom_api.py,sha256=H2IgM-LMjqXxVhbrtkXuiREb-p14zwMmllgl26a-jgw,3712
150
- evalscope/perf/dashscope_api.py,sha256=_XUF3czkYdPdVgtP7nqzRxROKxlqDjWs4DQnTyocNvM,3410
151
- evalscope/perf/dataset_plugin_base.py,sha256=6veUTyZ38W1Iig65vxNV9SfmqrsR8ID_UHgNiUO9Bv4,1814
152
- evalscope/perf/how_to_analysis_result.py,sha256=UVd_aYJ_7N5hl_wK9oIZig1vSwfgzodxW7XC6IWqbdg,1044
153
- evalscope/perf/http_client.py,sha256=4ppaZAIwrajJ9nzdgdwc3EdjmGSJz1_dg7Q6wQYELgw,34537
154
- evalscope/perf/openai_api.py,sha256=rJSGlXtnHgMNYcgO0bJQCsSLhKChUxklTk4cI63YTMQ,6066
155
- evalscope/perf/plugin_registry.py,sha256=D2MG2AXDBScjuKxB4g_Hg026pSRO752dBimonYtaAzM,782
156
- evalscope/perf/query_parameters.py,sha256=HfGRZJSzRMVfPezWTvbWhYeprCetGNPX_M_paoDtuOY,1346
157
- evalscope/perf/server_sent_event.py,sha256=s2UqUr1qAMWzBG1XWCFxhulyztd6FM0tGqVvPC8jD5o,1153
158
- evalscope/perf/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
159
- evalscope/perf/datasets/line_by_line.py,sha256=vbBNh0GcR-BfbFZMT6Z_3NqXe4y-uVfyaoooBRE7gjc,830
160
- evalscope/perf/datasets/longalpaca_12k.py,sha256=OaOzksyBBbeYwO0tFnKZ6UZ9PQO2RdMRD4HyCVBxnX4,934
161
- evalscope/perf/datasets/openqa.py,sha256=Dz5__mcYjP81Mc2NCGDAy-JFTVvif1slP7iWQflayFY,1018
162
- evalscope/preprocess/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
163
- evalscope/preprocess/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py,sha256=8dOPVWrzAXhzmzSKBWdWjfDqPlRaMH9slK8v5aWhwcQ,7810
181
+ evalscope/perf/arguments.py,sha256=ixiWx16qAL1gU7JTwoYOnvvc3IrwVWGz2uVno38gywA,8671
182
+ evalscope/perf/benchmark.py,sha256=Yiqcg5N03KmBa-5aWYNyklbYJ9Hqiuu1oaD8kBkFPSQ,9659
183
+ evalscope/perf/http_client.py,sha256=OpTgYl4obSpmyi5bOkTRSIQxp0aVdO08EcIVFAv-znU,7192
184
+ evalscope/perf/main.py,sha256=ljJDJVsD9hGWgF5bJCW-mfUGohc4LofaxiyAUfMa2WQ,997
185
+ evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
186
+ evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
187
+ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
188
+ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
189
+ evalscope/perf/plugin/api/custom_api.py,sha256=NQ2LDKsFQfExVRx2prcmfORCBzxxibfhpVHhB-lxAO4,3776
190
+ evalscope/perf/plugin/api/dashscope_api.py,sha256=0p9f6ujppS_H6w7wsIbRVNnCkHXtRemIai5Bhdogla4,3826
191
+ evalscope/perf/plugin/api/openai_api.py,sha256=I9yM4ouY1-xlBz4bYQ_62FZHKX4F3YCsg5GCqLU_9xA,6938
192
+ evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
193
+ evalscope/perf/plugin/datasets/base.py,sha256=1U_efZuU2ZdWV9UVAqFu1fx9_0PST_sJnaSIqbNvTF4,1787
194
+ evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
195
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=39jbcZde4cOY6PpJHeb20v5PIg58ezFMoXjYO7U6Z2A,1582
196
+ evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
197
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=ohmq3Mp0JKeG8h8ef9GYqN7pBLTHzpF8g9KrrriRbwM,1165
198
+ evalscope/perf/plugin/datasets/openqa.py,sha256=l9vCnEKBYU1a8uo49kArwSXu-ZaOXDHa2Pl3gp4yXE4,1395
199
+ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
200
+ evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
+ evalscope/perf/utils/analysis_result.py,sha256=o0wMcr9U0Gwd5lh5tAFCFpp3FmfwsaMppyJOLI2_sJ8,1213
202
+ evalscope/perf/utils/benchmark_util.py,sha256=-wZyZnWrXsQOzPrWdxQVbQUVUAljzsfWV4-2Hw_xzpQ,5565
203
+ evalscope/perf/utils/chat_service.py,sha256=ncMmeUDpOo7Kjkhe_TPDZY8ffoHTCl-B5szHJ4gipEo,8642
204
+ evalscope/perf/utils/db_util.py,sha256=TeZzcGoWDde81EjpDOyV6c2B1ZM7NzRv-0cEmeorGjE,7356
205
+ evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
206
+ evalscope/perf/utils/local_server.py,sha256=AezbEdPGuE1esCBxXtXJWjFYTZfFb6SYC6bAfcaX5Gk,4316
165
207
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
208
+ evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
209
+ evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=IQmfcwkzCCV-bMbIC9M2fd-X99bHJ_r_qfIJjClClx0,2760
210
+ evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
211
+ evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
212
+ evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
213
+ evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
214
+ evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
215
+ evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
216
+ evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
166
217
  evalscope/registry/tasks/arc.yaml,sha256=phXsBLsAgvHWmU31J89QMnJJnUioRphraQrF9SrJ53c,863
167
218
  evalscope/registry/tasks/bbh.yaml,sha256=Ircb_-_eVri2B1MHeSrFs9vIol7RY8ZaWwdz1j57NHA,701
168
219
  evalscope/registry/tasks/bbh_mini.yaml,sha256=eZYash__XJcfJau0VqujehuYE2WnFzrWr9s9jCkNT8Q,775
@@ -175,7 +226,10 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=KYLK-xtv_3qtgCZiwwP4-rP_ftc_qUmtsl1Tf
175
226
  evalscope/registry/tasks/mmlu.yaml,sha256=504yhHVfi9pvUBk_SGPs-Yx7R2hx_2_-nAFiGIiFGx4,726
176
227
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=wVbosZ5Tm9pwLG5nCphalezXilIjcq5j33nz3MR7_BE,778
177
228
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
229
+ evalscope/third_party/longbench_write/README.md,sha256=p7C5StphFFzEeMA2lcfKyeBlJgJiIjTSXvzwhw9md2k,3248
178
230
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
231
+ evalscope/third_party/longbench_write/default_task.json,sha256=HPSnI7Ar7cqe86wzQnH2XsDtqmAuCDLy3sZm3MeNyKc,711
232
+ evalscope/third_party/longbench_write/default_task.yaml,sha256=aQB-Cn-gEkdoI_26yOaeJWGpoI3-FxHBclZGAmxeBcc,579
179
233
  evalscope/third_party/longbench_write/eval.py,sha256=_fwV3f-Yq0qrkuZ6LBXvBiXnM6lpz6sOqd7BfYxEU80,11163
180
234
  evalscope/third_party/longbench_write/infer.py,sha256=MB0MdSM1qDx15FyrPSU6BXPbSGnBjxuTWqrcHAgbj9o,8318
181
235
  evalscope/third_party/longbench_write/longbench_write.py,sha256=MQzlIzv3sGlNgxgX0FPHtDIuAmgwThfBkMeKNcsR3U8,3926
@@ -187,9 +241,13 @@ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=
187
241
  evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
188
242
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
189
243
  evalscope/third_party/longbench_write/tools/data_etl.py,sha256=fSc4iT7_bdTvW20TbjlWme-k1pLqj_e2wXV8z831_Yw,5963
244
+ evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
190
245
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
246
+ evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
247
+ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2eexlehSi9LI4F3EPk-3JacrAb6ZoyxI,451
191
248
  evalscope/third_party/toolbench_static/eval.py,sha256=TqjMuuYePnD3bGRhQe1_9bIOlAW41kiFSztaEuppRLM,8237
192
249
  evalscope/third_party/toolbench_static/infer.py,sha256=WogwVXqDabdcsJ4uftZxAwR2wncp6HYpkS-fACEvjT4,9331
250
+ evalscope/third_party/toolbench_static/requirements.txt,sha256=JMIbWAfKRYcQh771IT-EjroMagXchYDSgfgY7gcqx08,21
193
251
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=uXvyeyNWTZHFVASnOeMf1sqHUjy9NQ3r8wbkhUQJL1g,1930
194
252
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
195
253
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=UywM8SU2ByFTzD4YkbB17SXJyxmzY1QDwARDuGzbCvs,1452
@@ -200,12 +258,29 @@ evalscope/tools/rewrite_eval_results.py,sha256=ZVi2hVjiTOmR_O5IaLv6qnQNpMz6FnDb9
200
258
  evalscope/utils/__init__.py,sha256=6RjACRYUSpGj6fkZ7NzYpl0lFppQCp9KVn5ktZe626s,128
201
259
  evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZmg,7670
202
260
  evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
203
- evalscope/utils/logger.py,sha256=cf3U400Mx1speMMNXorjwEE8noDz5Mbd-9PNgaulGeY,3013
261
+ evalscope/utils/logger.py,sha256=Nhm8u_Wpd5BlVPdv9IBW_M3XMEcp5UbkOf1oN2HvGG0,3060
204
262
  evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
205
263
  evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
206
- evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
207
- evalscope-0.6.0rc0.dist-info/METADATA,sha256=w2k8y1h3gVVNAI7Ey-mc4RWsaSjNBlokuu0hw4e-3aI,21242
208
- evalscope-0.6.0rc0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
209
- evalscope-0.6.0rc0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
210
- evalscope-0.6.0rc0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
211
- evalscope-0.6.0rc0.dist-info/RECORD,,
264
+ evalscope/utils/utils.py,sha256=bv_5zDNNzsODSwXz6M7TFkdfVJT6rw_orn_BG-qkijM,20567
265
+ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
266
+ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
267
+ tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
268
+ tests/cli/test_run.py,sha256=9GTF21NaUgERcF1Rkm9almO5-5pxsDF86Nw8fs8X7Hg,2926
269
+ tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
270
+ tests/perf/test_perf.py,sha256=Mn3nw2UJoR4qDLZ3Jhna3m52gD4mouc63uY_DLyXkG0,2889
271
+ tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
272
+ tests/rag/test_clip_benchmark.py,sha256=7NsOzgrpU9ou22M7fXtSFEnYt0iy2Q-ShIDL26Kp2gw,2597
273
+ tests/rag/test_mteb.py,sha256=MOksxYseIQ6SD_iFFxMC9BinvDtB0vlNSFEGJt0SGl8,4608
274
+ tests/rag/test_ragas.py,sha256=g3rAHymUzTyM6usIce6kItwyh1IocummK0BBPZiJPmY,4024
275
+ tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
276
+ tests/swift/test_run_swift_eval.py,sha256=Qop40c8jsHUbDTJe-Y8b_Aa8qn4Xstmu-FNGG14Gqik,5749
277
+ tests/swift/test_run_swift_vlm_eval.py,sha256=p2i2ZRj-vG1YsQGsemvQLHcyhjy1EmUChyAjFEmVbCE,4899
278
+ tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=c31jwQle_97ru5Dep91qsAqYjR1HDm1O9YZihRr0u0s,6018
279
+ tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
280
+ tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
281
+ evalscope-0.7.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
282
+ evalscope-0.7.0.dist-info/METADATA,sha256=W-NWOZwX9X-VN_LDI16aW6TxcOLJ3Um9dvms8bs28Bw,23796
283
+ evalscope-0.7.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
284
+ evalscope-0.7.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
285
+ evalscope-0.7.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
286
+ evalscope-0.7.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
tests/cli/test_run.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import subprocess
4
+ import unittest
5
+ from evalscope.utils import test_level_list, is_module_installed
6
+ from evalscope.utils.logger import get_logger
7
+
8
+ logger = get_logger()
9
+
10
+
11
class TestRun(unittest.TestCase):
    """Smoke tests that launch ``python3 -m evalscope.run`` in a subprocess.

    Each test builds a command line, runs it through the shell and requires a
    zero exit status. Tests are only enabled when ``0`` is in the list
    returned by ``test_level_list()``.
    """

    def setUp(self) -> None:
        logger.info('Init env for evalscope native run UTs ...\n')
        self._check_env('evalscope')

    def tearDown(self) -> None:
        pass

    @staticmethod
    def _check_env(module_name: str):
        """Log that ``module_name`` is installed, or raise if it is not.

        Raises:
            ModuleNotFoundError: when ``is_module_installed(module_name)`` is falsy.
        """
        if is_module_installed(module_name):
            logger.info(f'{module_name} is installed.')
        else:
            raise ModuleNotFoundError(f'run: pip install {module_name}')

    def _run_and_check(self, cmd: str, test_name: str) -> None:
        """Run ``cmd`` via the shell, log its captured output, then require exit code 0.

        Args:
            cmd: Full command line; executed with ``shell=True``, so it must
                only be built from trusted, test-local constants.
            test_name: Label used as a prefix in the log records.
        """
        logger.info(f'Start to run command: {cmd}')
        run_res = subprocess.run(cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Emit the captured streams BEFORE asserting: if the assertion came
        # first, a failing command would abort the test and its output (the
        # only clue to what went wrong) would never reach the log.
        logger.info(f'>>{test_name} stdout: {run_res.stdout}')
        # Successful runs may still write to stderr; only treat it as an
        # error record when the command actually failed.
        log_stderr = logger.info if run_res.returncode == 0 else logger.error
        log_stderr(f'>>{test_name} stderr: {run_res.stderr}')

        # unittest assertion instead of a bare ``assert`` so the check is not
        # stripped when Python runs with -O.
        self.assertEqual(
            run_res.returncode, 0,
            f'Failed to run command: {cmd}\nstderr: {run_res.stderr}')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_simple_eval(self):
        # Minimal invocation: one model, one dataset, a sample limit.
        model = 'ZhipuAI/chatglm3-6b'
        template_type = 'chatglm3'
        datasets = 'arc'  # arc ceval
        limit = 100

        cmd_simple = (f'python3 -m evalscope.run '
                      f'--model {model} '
                      f'--template-type {template_type} '
                      f'--datasets {datasets} '
                      f'--limit {limit}')

        self._run_and_check(cmd_simple, 'test_run_simple_eval')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_eval_with_args(self):
        # Same entry point, with two datasets plus generation and dataset options.
        model = 'ZhipuAI/chatglm3-6b'
        template_type = 'chatglm3'
        datasets = 'arc ceval'
        limit = 5
        dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'

        # The JSON payload is wrapped in single quotes so the shell passes it
        # to the CLI as one argument.
        cmd_with_args = (f'python3 -m evalscope.run '
                         f'--model {model} '
                         f'--template-type {template_type} '
                         f'--datasets {datasets} '
                         f'--limit {limit} '
                         f'--generation-config do_sample=false,temperature=0.0 '
                         f"--dataset-args '{dataset_args}' ")

        self._run_and_check(cmd_with_args, 'test_run_eval_with_args')

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_eval_local(self):
        # Placeholder: no local-run scenario is implemented yet.
        ...
73
+
74
+
75
+ if __name__ == '__main__':
76
+ unittest.main()
tests/perf/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -0,0 +1,96 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
3
+ import unittest
4
+
5
+ from evalscope.perf.main import run_perf_benchmark
6
+ from evalscope.utils import test_level_list
7
+
8
+
9
class TestPerf(unittest.TestCase):
    """Exercises ``run_perf_benchmark`` with several task configurations.

    Every test only builds a configuration dict and hands it to
    ``run_perf_benchmark``; no assertions are made on the outcome. Tests are
    enabled only when ``0`` is in the list returned by ``test_level_list()``.
    """

    def setUp(self) -> None:
        # No per-test fixture is required.
        return None

    def tearDown(self) -> None:
        # Nothing to clean up.
        return None

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf(self):
        # 'openai' api against a chat-completions URL on 127.0.0.1:8000.
        cfg = dict(
            url='http://127.0.0.1:8000/v1/chat/completions',
            parallel=1,
            model='qwen2.5',
            number=15,
            api='openai',
            dataset='openqa',
            debug=True,
        )
        run_perf_benchmark(cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf_stream(self):
        # Same as test_run_perf, with 'stream' switched on.
        cfg = dict(
            url='http://127.0.0.1:8000/v1/chat/completions',
            parallel=1,
            model='qwen2.5',
            number=15,
            api='openai',
            dataset='openqa',
            stream=True,
            debug=True,
        )
        run_perf_benchmark(cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf_speed_benchmark(self):
        # 'speed_benchmark' dataset against the plain completions URL; no 'number' key.
        cfg = dict(
            url='http://127.0.0.1:8000/v1/completions',
            parallel=1,
            model='qwen2.5',
            api='openai',
            dataset='speed_benchmark',
            debug=True,
        )
        run_perf_benchmark(cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf_local(self):
        # 'local' api: no URL is supplied, only a model identifier.
        cfg = dict(
            parallel=1,
            model='Qwen/Qwen2.5-0.5B-Instruct',
            number=5,
            api='local',
            dataset='openqa',
            debug=True,
        )
        run_perf_benchmark(cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf_local_stream(self):
        # Same as test_run_perf_local, with 'stream' switched on.
        cfg = dict(
            parallel=1,
            model='Qwen/Qwen2.5-0.5B-Instruct',
            number=5,
            api='local',
            dataset='openqa',
            stream=True,
            debug=True,
        )
        run_perf_benchmark(cfg)

    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_perf_local_speed_benchmark(self):
        # 'local_vllm' api with min_tokens and max_tokens both set to 2048.
        cfg = dict(
            parallel=1,
            model='Qwen/Qwen2.5-0.5B-Instruct',
            api='local_vllm',
            dataset='speed_benchmark',
            min_tokens=2048,
            max_tokens=2048,
            debug=True,
        )
        run_perf_benchmark(cfg)
93
+
94
+
95
+ if __name__ == '__main__':
96
+ unittest.main(buffer=False)
tests/rag/__init__.py ADDED
File without changes
@@ -0,0 +1,85 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import os
4
+
5
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
6
+ import subprocess
7
+ import unittest
8
+ from evalscope.utils import test_level_list, is_module_installed
9
+ from evalscope.utils.logger import get_logger
10
+ from evalscope.run import run_task
11
+
12
+ logger = get_logger()
13
+
14
+
15
class TestCLIPBenchmark(unittest.TestCase):
    """Runs ``run_task`` with the ``RAGEval`` backend and the ``clip_benchmark`` tool.

    ``setUp`` requires the ``webdataset`` module. Tests are enabled only when
    ``0`` is in the list returned by ``test_level_list()`` and make no
    assertions beyond ``run_task`` completing without raising.
    """

    def setUp(self) -> None:
        self._check_env("webdataset")

    def tearDown(self) -> None:
        # Nothing to clean up.
        return None

    @staticmethod
    def _check_env(module_name: str):
        """Raise ``ModuleNotFoundError`` unless ``is_module_installed`` accepts ``module_name``."""
        if not is_module_installed(module_name):
            raise ModuleNotFoundError(f"run: pip install {module_name}")
        logger.info(f"{module_name} is installed.")

    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
    def test_run_task(self):
        # Named datasets ("muge", "mnist") with explicit output and cache directories.
        eval_options = {
            "models": [
                {"model_name": "AI-ModelScope/chinese-clip-vit-large-patch14-336px"},
            ],
            "dataset_name": ["muge", "mnist"],
            "split": "test",
            "batch_size": 128,
            "num_workers": 1,
            "verbose": True,
            "skip_existing": False,
            "output_dir": "outputs",
            "cache_dir": "cache",
            "limit": 1000,
        }
        backend_config = {"tool": "clip_benchmark", "eval": eval_options}

        run_task({"eval_backend": "RAGEval", "eval_config": backend_config})

    @unittest.skipUnless(0 in test_level_list(), "skip test in current test level")
    def test_run_custom(self):
        # "custom" dataset read from data_dir; no output_dir / cache_dir keys are set.
        eval_options = {
            "models": [
                {"model_name": "AI-ModelScope/chinese-clip-vit-large-patch14-336px"},
            ],
            "dataset_name": ["custom"],
            "data_dir": "custom_eval/multimodal/text-image-retrieval",
            "split": "test",
            "batch_size": 128,
            "num_workers": 1,
            "verbose": True,
            "skip_existing": False,
            "limit": 1000,
        }
        backend_config = {"tool": "clip_benchmark", "eval": eval_options}

        run_task({"eval_backend": "RAGEval", "eval_config": backend_config})
82
+
83
+
84
+ if __name__ == "__main__":
85
+ unittest.main(buffer=False)