evalscope 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (68) hide show
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +38 -2
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +6 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/models/model_adapter.py +1 -1
  16. evalscope/perf/arguments.py +3 -1
  17. evalscope/perf/benchmark.py +3 -3
  18. evalscope/perf/main.py +5 -6
  19. evalscope/perf/plugin/api/openai_api.py +53 -49
  20. evalscope/perf/plugin/registry.py +3 -3
  21. evalscope/perf/utils/benchmark_util.py +4 -4
  22. evalscope/perf/utils/db_util.py +66 -22
  23. evalscope/perf/utils/local_server.py +4 -1
  24. evalscope/run.py +45 -82
  25. evalscope/run_arena.py +2 -1
  26. evalscope/summarizer.py +14 -26
  27. evalscope/third_party/longbench_write/eval.py +2 -1
  28. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  29. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  30. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  31. evalscope/tools/combine_reports.py +2 -4
  32. evalscope/tools/rewrite_eval_results.py +1 -1
  33. evalscope/utils/__init__.py +1 -0
  34. evalscope/utils/chat_service.py +1 -1
  35. evalscope/utils/io_utils.py +162 -0
  36. evalscope/utils/logger.py +8 -0
  37. evalscope/utils/utils.py +0 -175
  38. evalscope/version.py +2 -2
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
  40. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/RECORD +47 -67
  41. tests/cli/test_run.py +11 -12
  42. tests/perf/test_perf.py +3 -2
  43. tests/vlm/test_vlmeval.py +3 -2
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  52. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  53. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  54. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  55. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  56. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  57. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  58. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  59. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  60. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  61. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  62. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  63. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  64. evalscope/evaluator/humaneval_evaluator.py +0 -158
  65. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
  66. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
  67. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,13 @@
1
1
  evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
2
2
  evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
3
- evalscope/config.py,sha256=KYS_O0RdAbruQhqP6mp3rQL0003Oaskx03IroZUGRps,6897
4
- evalscope/constants.py,sha256=D2MU7bs_qwmcHQ1ge05C5Ekk04XqMyiGxssvKwAecxI,4515
5
- evalscope/run.py,sha256=5cG81qfdpMN_GtPphvJ7BHboD6LBYHWyodX8ViR1XL4,8874
6
- evalscope/run_arena.py,sha256=Kmzak4TGdATbOhOCe_zLLRxDvgtkOfs6e4VaxOAzPKk,8550
7
- evalscope/summarizer.py,sha256=Eq7ZqGKuvrhWVeGriLxHCGupgnJmtvmIGqZYzRNaY8I,6480
8
- evalscope/version.py,sha256=Xha7v5_YH0Oppyh6iO7HrpSsmv1WCPdQPFtzYTJvG4A,118
3
+ evalscope/config.py,sha256=_4IRpoAssdHEg75UKPKVw6FVaCu2NaP2aOMA5DRsuGU,8444
4
+ evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
5
+ evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
6
+ evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
7
+ evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
8
+ evalscope/version.py,sha256=uvEbCM3fC0oZ2Rt82Q0oErXsM-iYBNxJtPPLXPwscAU,118
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- evalscope/backend/base.py,sha256=l7zUHXX2XToIfU_hkVeTSHT9wWURYumyohXCIgywZBI,1021
10
+ evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
12
12
  evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
13
13
  evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
@@ -42,34 +42,14 @@ evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3H
42
42
  evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
43
43
  evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
44
44
  evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
45
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
46
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
47
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
48
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
49
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
50
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
51
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
52
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
53
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
54
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
55
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
56
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
57
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
58
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
59
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
60
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
61
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
62
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
63
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
64
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
65
45
  evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
66
46
  evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
67
47
  evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
68
48
  evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
69
49
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
70
50
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
- evalscope/backend/rag_eval/utils/clip.py,sha256=WZovQJGyPI33Y-9bUnanR6fIYJzrXgnjD4zVwUJSgCw,5002
72
- evalscope/backend/rag_eval/utils/embedding.py,sha256=XWI07YeWDALc2etP4DGluYqrid85nKz1tjM91JLZRmM,6252
51
+ evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
52
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
73
53
  evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
74
54
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
75
55
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
@@ -122,23 +102,23 @@ evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0r
122
102
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
123
103
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
124
104
  evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
125
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=Y7_d6hmh94W2XbzUnDMX9_uKWcarK0zv4Q4mQWUfSZ8,5869
105
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=cSW0Mq9__-gh-tVoVXD9Rk6h3h2iZW-Fu3RQ16haJhQ,5878
126
106
  evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
127
107
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
128
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=gg65W_pz4mPOBUOwaYIgfUxGKzrmRZRuoEg5xtS8bYg,13830
108
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=KBI9t5F7XW1Cs44QUA7ultkfsXxLyucH9zNYe-jOQQk,13866
129
109
  evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
130
110
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
131
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=7REJeC8vD8OVtmcqI5TP6cTn88-KOzBs5oOKEZEmESs,8459
111
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=IIesSMPw1Yya4-LjqJt1QVkpOx8RGKwBYTQtmc0VfaQ,8495
132
112
  evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
133
113
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
134
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=HxAjkIA-Wt5-wb8kNSDMzZRoHflgsNxIfa1BoeVzwog,1660
114
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=VAO7siedusq9z3b1J3ztFE4XDopYKqmwe2n-Numg7HY,9149
135
115
  evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
136
116
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
137
117
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
138
118
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
139
119
  evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
140
120
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
141
- evalscope/benchmarks/race/race_adapter.py,sha256=Ppo7bttx15zB-m-UtguIwIXgqpEKAi_ClIOol0hPQiE,9805
121
+ evalscope/benchmarks/race/race_adapter.py,sha256=WgnWYSctc3VtWm2FAeVDTlxR2hwXsF2tala7n66f5mw,9841
142
122
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
143
123
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
144
124
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
@@ -153,12 +133,11 @@ evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
153
133
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
154
134
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
155
135
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
156
- evalscope/evaluator/__init__.py,sha256=h_EyZm7vDqBsGx6CkoQVLg0aMy0tE_IG5uEnheubb0s,174
157
- evalscope/evaluator/evaluator.py,sha256=MGkuJi9o5Hdbj_fN7qolDqP0B47i9i0ksGd1uc-TMn0,18365
158
- evalscope/evaluator/humaneval_evaluator.py,sha256=245XRxwulGQpjdapwU8CiYJn1xT0XKxl7hdWvzFxLG0,5964
159
- evalscope/evaluator/rating_eval.py,sha256=VuDIZcmSlsv1tc8znDGesz8ZwpQ7NvZJPv823Quvht0,5566
136
+ evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
137
+ evalscope/evaluator/evaluator.py,sha256=wrTWyvyD1eqSvsZRwDRV1UVBxXv7y-2A29UCD9F-5qI,18412
138
+ evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
160
139
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
161
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=YVTJAHK0uz9hNupsdeTXMM2PISECf8phXq0GYPr4law,16378
140
+ evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
162
141
  evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
163
142
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
164
143
  evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
@@ -171,24 +150,24 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk
171
150
  evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
172
151
  evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
173
152
  evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
174
- evalscope/models/model_adapter.py,sha256=XBeSFTR9pXmnhFWRRddcobnITC5T4JKooeFUeWEtUVI,19006
153
+ evalscope/models/model_adapter.py,sha256=5jzDXpFp24ZZ25tjpIMJeDTz-lDSD_EHp040gJOZACc,19007
175
154
  evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
176
155
  evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
177
156
  evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
178
157
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
179
158
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
180
159
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
- evalscope/perf/arguments.py,sha256=_gW1tq7SbrAZd05N-FbY_oWrQB0Djs4KUaFdXSfFsr8,9112
182
- evalscope/perf/benchmark.py,sha256=ff9PFFMY5UucuUihcdo6lSf1X9XXoaOmrpBvjDk5Mrw,9599
160
+ evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
161
+ evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
183
162
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
184
- evalscope/perf/main.py,sha256=-8NsvJZ7uyVfJT9N2lX36KfsHkVTy0r8OcsWPYoKms0,1316
163
+ evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
185
164
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
186
- evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
165
+ evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
187
166
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
188
167
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
189
168
  evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
190
169
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
191
- evalscope/perf/plugin/api/openai_api.py,sha256=KRN6EjObTG08mcI82kJD3dGK7DoVMUZzrUZ1AgoLEp0,7007
170
+ evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
192
171
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
193
172
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
194
173
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -199,10 +178,10 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1
199
178
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
200
179
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
180
  evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
202
- evalscope/perf/utils/benchmark_util.py,sha256=xFZSSUoBoFpHRZC69-KS9cK2vqJlL7rIuCEz_MnpnGA,5564
203
- evalscope/perf/utils/db_util.py,sha256=A2K3otCrNw3K1SMwoYo8a6jekT5nAVvWJepqi31DH28,7479
181
+ evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
182
+ evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
204
183
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
205
- evalscope/perf/utils/local_server.py,sha256=31EQZ8S_SzgSiBFpc9zRU13GXm2jREvRmPDN5qWKgbg,4468
184
+ evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
206
185
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
207
186
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
208
187
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -229,9 +208,9 @@ evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86J
229
208
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
230
209
  evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
231
210
  evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
232
- evalscope/third_party/longbench_write/eval.py,sha256=bZrpaKg9sPXv2VkUxLpfJiNqMIoIj7Pf3eFMqmDncyY,11229
211
+ evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
233
212
  evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
234
- evalscope/third_party/longbench_write/longbench_write.py,sha256=1caNiJvmZL2vwDU6oHUE4cdCViZGYE8yBo9EsMcA-Qw,3955
213
+ evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
235
214
  evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
236
215
  evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
237
216
  evalscope/third_party/longbench_write/resources/judge.txt,sha256=Go1ISY4bUBmEDXXY_DItjAmskuHSaRj5WTNMNH98FSk,1885
@@ -239,7 +218,7 @@ evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26
239
218
  evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
240
219
  evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
241
220
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
242
- evalscope/third_party/longbench_write/tools/data_etl.py,sha256=nmWKOrD-GeZi0ZGH5jLCGuW3qiLTui8ASSxI2z8l6ls,5962
221
+ evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
243
222
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
244
223
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
245
224
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -247,26 +226,27 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
247
226
  evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
248
227
  evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
249
228
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
250
- evalscope/third_party/toolbench_static/toolbench_static.py,sha256=y4nC9WCBCgBg378aWYAdhmrFte_r_XOkigJs7XJ_iXQ,1930
229
+ evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
251
230
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
252
231
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
253
232
  evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
254
- evalscope/tools/combine_reports.py,sha256=1BJ29IEUKoZLM3NAzg_IpU8B9uhljO9-b_hqAYi9RpA,5078
233
+ evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
255
234
  evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
256
- evalscope/tools/rewrite_eval_results.py,sha256=2lbDHfF_9abK1tUk2UYZZRwzO68eoiE36dXyh_b-mwg,2011
257
- evalscope/utils/__init__.py,sha256=hDS1xpoAxtVH4-ZQOXstdg7WYmjcGPQ62Kh54FIgkwU,87
235
+ evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
236
+ evalscope/utils/__init__.py,sha256=ZOri8VHx8LpJBJS90uw8h0Z7gPhtxhjWlBPWuuZgoRE,121
258
237
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
259
- evalscope/utils/chat_service.py,sha256=N8lJPiVtzdqsHypa42wzb15T7hduXUrRPtU3Atf8yg4,8641
238
+ evalscope/utils/chat_service.py,sha256=VdNPXdFSf-4zxe0Ht74LBcdRNbpb9vzVi86HDEqfXHc,8647
260
239
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
261
- evalscope/utils/logger.py,sha256=IkY0oxkWSvfA0z1m79crioTiqQcnxulNF5HtJNlV0Fc,3174
240
+ evalscope/utils/io_utils.py,sha256=MnEi4llOYtXK81bUQ_XE_WP5qIsVrJ4MlKmWMH9vzFs,3993
241
+ evalscope/utils/logger.py,sha256=4OGlkBsut_wzq-1UcM2DKQKdKs1FRNYGHw538TGvypU,3440
262
242
  evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
263
- evalscope/utils/utils.py,sha256=PVtpv3WAIm6Bs66Vz4KBDiAiXp8y6Oejxxr1LWHTRsI,15146
243
+ evalscope/utils/utils.py,sha256=lZl5lt4WqjoY5SEfsum8Sc-s_c9GSlmIZlkTAQkMnjE,10485
264
244
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
265
245
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
266
246
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
267
- tests/cli/test_run.py,sha256=lXR35DDLQjdb-XGA6pKnQC9pJTfTOHjknAN7PEaw8G4,4334
247
+ tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
268
248
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
269
- tests/perf/test_perf.py,sha256=GD5nInXpQG7H1E8wI6dvy4DFSvTEddGDzv-Cu8YV1ts,2995
249
+ tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
270
250
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
251
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
272
252
  tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
@@ -276,10 +256,10 @@ tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p
276
256
  tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
277
257
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
278
258
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
279
- tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
280
- evalscope-0.8.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
281
- evalscope-0.8.0.dist-info/METADATA,sha256=5RKZaNBwuJj84sdAXlNmT11Bm8kGYha6EYnqszwZ1Qk,23190
282
- evalscope-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
283
- evalscope-0.8.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
284
- evalscope-0.8.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
285
- evalscope-0.8.0.dist-info/RECORD,,
259
+ tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
260
+ evalscope-0.8.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
261
+ evalscope-0.8.2.dist-info/METADATA,sha256=Fk1p0gh2RycQ7yOBj7fMYym7G-SYj8sL-IZX8cgGxVQ,23709
262
+ evalscope-0.8.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
263
+ evalscope-0.8.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
264
+ evalscope-0.8.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
265
+ evalscope-0.8.2.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -70,7 +70,7 @@ class TestRun(unittest.TestCase):
70
70
 
71
71
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
72
72
  def test_run_task(self):
73
- task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k'], 'limit': 2, 'debug': False}
73
+ task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False}
74
74
  run_task(task_cfg=task_cfg)
75
75
 
76
76
 
@@ -80,33 +80,32 @@ class TestRun(unittest.TestCase):
80
80
 
81
81
  task_cfg = TaskConfig(
82
82
  model='qwen/Qwen2-0.5B-Instruct',
83
- datasets=['ceval'], # 数据格式,选择题格式固定为 'ceval'
83
+ datasets=['ceval', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
84
84
  dataset_args={
85
85
  'ceval': {
86
86
  'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
87
87
  'subset_list': [
88
88
  'example' # 评测数据集名称,上述 *_dev.csv 中的 *
89
89
  ]
90
+ },
91
+ 'general_qa': {
92
+ 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
93
+ 'subset_list': [
94
+ 'example' # 评测数据集名称,上述 *_dev.csv 中的 *
95
+ ]
90
96
  }
91
97
  },
92
98
  )
93
99
  run_task(task_cfg=task_cfg)
94
100
 
95
101
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
96
- def test_run_custom_qa(self):
102
+ def test_run_humaneval(self):
97
103
  from evalscope.config import TaskConfig
98
104
 
99
105
  task_cfg = TaskConfig(
100
106
  model='qwen/Qwen2-0.5B-Instruct',
101
- datasets=['general_qa'], # 数据格式,选择题格式固定为 'ceval'
102
- dataset_args={
103
- 'general_qa': {
104
- 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
105
- 'subset_list': [
106
- 'example' # 评测数据集名称,上述 *_dev.csv 中的 *
107
- ]
108
- }
109
- },
107
+ datasets=['humaneval'],
108
+ limit=2
110
109
  )
111
110
 
112
111
  run_task(task_cfg=task_cfg)
tests/perf/test_perf.py CHANGED
@@ -19,12 +19,13 @@ class TestPerf(unittest.TestCase):
19
19
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
20
20
  def test_run_perf(self):
21
21
  task_cfg = {
22
- 'url': 'http://127.0.0.1:8000/v1/chat/completions',
22
+ 'url': 'http://127.0.0.1:8001/v1/chat/completions',
23
23
  'parallel': 1,
24
24
  'model': 'qwen2.5',
25
25
  'number': 15,
26
26
  'api': 'openai',
27
27
  'dataset': 'openqa',
28
+ # 'stream': True,
28
29
  'debug': True,
29
30
  }
30
31
  run_perf_benchmark(task_cfg)
@@ -46,7 +47,7 @@ class TestPerf(unittest.TestCase):
46
47
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
47
48
  def test_run_perf_speed_benchmark(self):
48
49
  task_cfg = {
49
- 'url': 'http://127.0.0.1:8000/v1/completions',
50
+ 'url': 'http://127.0.0.1:8001/v1/completions',
50
51
  'parallel': 1,
51
52
  'model': 'qwen2.5',
52
53
  'api': 'openai',
tests/vlm/test_vlmeval.py CHANGED
@@ -40,8 +40,9 @@ class TestVLMEval(unittest.TestCase):
40
40
  }], # model name for VLMEval config
41
41
  'nproc': 1,
42
42
  'reuse': True,
43
- 'work_dir': 'outputs'
44
- }
43
+ },
44
+ 'work_dir': 'outputs',
45
+ 'use_cache': 'outputs/20241216_142838'
45
46
  }
46
47
 
47
48
  logger.info(f'>> Start to run task: {task_cfg}')
@@ -1,87 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -492257975294377194,
4
- "language": "chinese",
5
- "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "是什么为太阳提供能量,它的主要功能是什么?",
10
- "answer": [
11
- "太阳的能量来自核裂变,类似于地球上的核反应堆。",
12
- "太阳的主要功能是为太阳系提供光。"
13
- ],
14
- "ground_truth": [
15
- "太阳的能量来自核聚变,其中氢原子融合形成氦。",
16
- "太阳核心的这种聚变过程释放出巨大的能量。",
17
- "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
18
- "太阳的光在地球的气候系统中起着关键作用。",
19
- "阳光有助于驱动天气和海洋洋流。"
20
- ]
21
- },
22
- "output": {
23
- "TP": [
24
- {
25
- "statement": "太阳的主要功能是为太阳系提供光。",
26
- "reason": "这一说法在某种程度上得到了地面事实的支持,提到太阳提供光和它的作用,尽管它更广泛地关注太阳的能量。"
27
- }
28
- ],
29
- "FP": [
30
- {
31
- "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
32
- "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
33
- }
34
- ],
35
- "FN": [
36
- {
37
- "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
38
- "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
39
- },
40
- {
41
- "statement": "太阳核心的这种聚变过程释放出巨大的能量。",
42
- "reason": "这个过程及其重要性没有在答案中提到。"
43
- },
44
- {
45
- "statement": "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
46
- "reason": "答案中只提到了光,忽略了热量及其对生命的必要性,这些在地面事实中都有涵盖。"
47
- },
48
- {
49
- "statement": "太阳的光在地球的气候系统中起着关键作用。",
50
- "reason": "太阳光对地球气候系统的这种更广泛的影响没有在答案中提到。"
51
- },
52
- {
53
- "statement": "阳光有助于驱动天气和海洋洋流。",
54
- "reason": "答案中省略了阳光对天气模式和海洋洋流的影响。"
55
- }
56
- ]
57
- }
58
- },
59
- {
60
- "input": {
61
- "question": "水的沸点是多少?",
62
- "answer": [
63
- "水的沸点在海平面上是100摄氏度。"
64
- ],
65
- "ground_truth": [
66
- "水的沸点在海平面上是100摄氏度(212华氏度)。",
67
- "水的沸点会随着海拔的变化而变化。"
68
- ]
69
- },
70
- "output": {
71
- "TP": [
72
- {
73
- "statement": "水的沸点在海平面上是100摄氏度。",
74
- "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
75
- }
76
- ],
77
- "FP": [],
78
- "FN": [
79
- {
80
- "statement": "水的沸点会随着海拔的变化而变化。",
81
- "reason": "关于水的沸点如何随海拔变化的额外信息没有在答案中提到。"
82
- }
83
- ]
84
- }
85
- }
86
- ]
87
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -8546983388246528139,
4
- "language": "chinese",
5
- "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "阿尔伯特·爱因斯坦是谁,他以什么而闻名?",
10
- "answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。",
11
- "sentences": {
12
- "0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
13
- "1": "他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。"
14
- }
15
- },
16
- "output": {
17
- "sentences": [
18
- {
19
- "sentence_index": 0,
20
- "simpler_statements": [
21
- "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
22
- "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
23
- ]
24
- },
25
- {
26
- "sentence_index": 1,
27
- "simpler_statements": [
28
- "阿尔伯特·爱因斯坦最著名的是发展了相对论。",
29
- "阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
30
- ]
31
- }
32
- ]
33
- }
34
- }
35
- ]
36
- }
@@ -1,26 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 7951911230338252816,
4
- "language": "chinese",
5
- "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
6
- "examples": [
7
- {
8
- "input": {
9
- "response": "阿尔伯特·爱因斯坦出生在德国。"
10
- },
11
- "output": {
12
- "question": "阿尔伯特·爱因斯坦出生在哪里?",
13
- "noncommittal": 0
14
- }
15
- },
16
- {
17
- "input": {
18
- "response": "我不知道2023年发明的智能手机的突破性功能,因为我对2022年以后的信息不了解。"
19
- },
20
- "output": {
21
- "question": "2023年发明的智能手机的突破性功能是什么?",
22
- "noncommittal": 1
23
- }
24
- }
25
- ]
26
- }
@@ -1,41 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -5318808809674890018,
4
- "language": "chinese",
5
- "instruction": "给定问题、答案和背景,验证背景在得出给定答案时是否有用。如果有用,判定为“1”,如果没有用,判定为“0”,并以json格式输出。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么?",
10
- "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²,源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
11
- "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
12
- },
13
- "output": {
14
- "reason": "提供的背景确实有助于得出给定的答案。背景包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息,这些信息在答案中得到了反映。",
15
- "verdict": 1
16
- }
17
- },
18
- {
19
- "input": {
20
- "question": "谁赢得了2020年ICC世界杯?",
21
- "context": "2022年ICC男子T20世界杯于2022年10月16日至11月13日在澳大利亚举行,是该赛事的第八届。原定于2020年举行,但因COVID-19大流行而推迟。英格兰在决赛中以五个小门击败巴基斯坦,赢得了他们的第二个ICC男子T20世界杯冠军。",
22
- "answer": "英格兰"
23
- },
24
- "output": {
25
- "reason": "背景有助于澄清关于2020年ICC世界杯的情况,并指出英格兰是原定于2020年举行但实际上在2022年举行的比赛的获胜者。",
26
- "verdict": 1
27
- }
28
- },
29
- {
30
- "input": {
31
- "question": "世界上最高的山是什么?",
32
- "context": "安第斯山脉是世界上最长的大陆山脉,位于南美洲。它横跨七个国家,拥有西半球许多最高的山峰。该山脉以其多样的生态系统而闻名,包括高海拔的安第斯高原和亚马逊雨林。",
33
- "answer": "珠穆朗玛峰。"
34
- },
35
- "output": {
36
- "reason": "提供的背景讨论了安第斯山脉,虽然令人印象深刻,但不包括珠穆朗玛峰,也与关于世界最高山的问题没有直接关系。",
37
- "verdict": 0
38
- }
39
- }
40
- ]
41
- }
@@ -1,7 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -1333942410710431097,
4
- "language": "chinese",
5
- "instruction": "给定文档摘要和节点内容,将节点内容评分在1到5的范围内。",
6
- "examples": []
7
- }
@@ -1,60 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5296785184599215999,
4
- "language": "chinese",
5
- "instruction": "您的任务是根据给定的上下文判断一系列陈述的真实性。对于每个陈述,如果可以根据上下文直接推断出该陈述,则必须返回判决为1;如果不能根据上下文直接推断出该陈述,则返回判决为0。",
6
- "examples": [
7
- {
8
- "input": {
9
- "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他注册了几门课程,包括数据结构、算法和数据库管理。约翰是一个勤奋的学生,花费大量时间学习和完成作业。他经常在图书馆待到很晚以完成他的项目。",
10
- "statements": [
11
- "约翰主修生物学。",
12
- "约翰正在学习一门人工智能课程。",
13
- "约翰是一个勤奋的学生。",
14
- "约翰有一份兼职工作。"
15
- ]
16
- },
17
- "output": {
18
- "statements": [
19
- {
20
- "statement": "约翰主修生物学。",
21
- "reason": "约翰的专业明确提到是计算机科学。没有信息表明他主修生物学。",
22
- "verdict": 0
23
- },
24
- {
25
- "statement": "约翰正在学习一门人工智能课程。",
26
- "reason": "上下文中提到约翰目前注册的课程,并未提到人工智能。因此,不能推断出约翰正在学习人工智能课程。",
27
- "verdict": 0
28
- },
29
- {
30
- "statement": "约翰是一个勤奋的学生。",
31
- "reason": "上下文中提到他花费大量时间学习和完成作业。此外,还提到他经常在图书馆待到很晚以完成他的项目,这意味着他很勤奋。",
32
- "verdict": 1
33
- },
34
- {
35
- "statement": "约翰有一份兼职工作。",
36
- "reason": "上下文中没有给出约翰有兼职工作的信息。",
37
- "verdict": 0
38
- }
39
- ]
40
- }
41
- },
42
- {
43
- "input": {
44
- "context": "光合作用是植物、藻类和某些细菌用来将光能转化为化学能的过程。",
45
- "statements": [
46
- "阿尔伯特·爱因斯坦是个天才。"
47
- ]
48
- },
49
- "output": {
50
- "statements": [
51
- {
52
- "statement": "阿尔伯特·爱因斯坦是个天才。",
53
- "reason": "上下文和陈述无关",
54
- "verdict": 0
55
- }
56
- ]
57
- }
58
- }
59
- ]
60
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -8546983388246528139,
4
- "language": "chinese",
5
- "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "阿尔伯特·爱因斯坦是谁,他最出名的是什么?",
10
- "answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论,他还为量子力学理论的发展做出了重要贡献。",
11
- "sentences": {
12
- "0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
13
- "1": "他最出名的是发展了相对论,他还为量子力学理论的发展做出了重要贡献。"
14
- }
15
- },
16
- "output": {
17
- "sentences": [
18
- {
19
- "sentence_index": 0,
20
- "simpler_statements": [
21
- "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
22
- "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
23
- ]
24
- },
25
- {
26
- "sentence_index": 1,
27
- "simpler_statements": [
28
- "阿尔伯特·爱因斯坦最出名的是发展了相对论。",
29
- "阿尔伯特·爱因斯坦还为量子力学理论的发展做出了重要贡献。"
30
- ]
31
- }
32
- ]
33
- }
34
- }
35
- ]
36
- }