evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (105) hide show
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +10 -6
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -108
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/perf/arguments.py +1 -0
  58. evalscope/perf/benchmark.py +1 -1
  59. evalscope/perf/main.py +3 -1
  60. evalscope/perf/plugin/api/openai_api.py +51 -47
  61. evalscope/perf/utils/local_server.py +1 -0
  62. evalscope/run.py +37 -66
  63. evalscope/run_arena.py +1 -1
  64. evalscope/utils/__init__.py +1 -1
  65. evalscope/utils/chat_service.py +4 -3
  66. evalscope/utils/io_utils.py +8 -0
  67. evalscope/utils/logger.py +4 -0
  68. evalscope/utils/model_utils.py +10 -0
  69. evalscope/utils/utils.py +3 -25
  70. evalscope/version.py +2 -2
  71. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
  72. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
  73. tests/cli/test_collection.py +53 -0
  74. tests/cli/test_run.py +43 -1
  75. tests/perf/test_perf.py +3 -3
  76. tests/rag/test_mteb.py +3 -2
  77. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  78. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  79. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  80. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  81. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  82. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  83. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  84. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  85. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  86. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  87. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  88. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  89. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  90. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  91. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  92. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  93. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  94. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  95. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  96. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  97. evalscope/models/api/__init__.py +0 -3
  98. evalscope/models/dummy_chat_model.py +0 -49
  99. evalscope/models/model_adapter.py +0 -525
  100. evalscope/models/openai_model.py +0 -103
  101. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  102. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  103. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  104. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
1
- evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
2
- evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
3
- evalscope/config.py,sha256=ZDN0XVCCXMSSD675Smzm57fNDOx-cZTsNvPboMtYVow,8407
4
- evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
5
- evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
6
- evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
1
+ evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
+ evalscope/arguments.py,sha256=v0oKhnJ-2RUpEWWKC_-e7Km5osgPJeZC_aKw8R-3Y0A,4382
3
+ evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
4
+ evalscope/constants.py,sha256=SAa5IEjcDvcH_ePvCcbValAEyMvGnXPdO0jDmKk8uUs,3277
5
+ evalscope/run.py,sha256=cFUwfsXDTQ8NGJYe314LDF_hnuM60UUQxzgbOcPRDbY,5619
6
+ evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
8
- evalscope/version.py,sha256=OXwZDg6ML1mbsIw-CBhWRf4zVz2ArW2PFzzLK9FVAZk,118
8
+ evalscope/version.py,sha256=zr0PUDVLPIYwSv10FsTbYbOSBc6BNKFH3cDqhMMp1Jg,118
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y
22
22
  evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
23
23
  evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
25
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=L0WYiy3Rgar0uMZRI-kz1qCEuUaFXwcsVj0CACG13ms,7439
25
+ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=NwpxNECN7NFgtlVdKY7vet5m-gAmIp8MJYka0eexWu0,7424
26
26
  evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
27
27
  evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
28
28
  evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
@@ -42,26 +42,6 @@ evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3H
42
42
  evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
43
43
  evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
44
44
  evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
45
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
46
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
47
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
48
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
49
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
50
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
51
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
52
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
53
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
54
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
55
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
56
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
57
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
58
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
59
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
60
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
61
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
62
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
63
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
64
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
65
45
  evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
66
46
  evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
67
47
  evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
@@ -70,19 +50,19 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
70
50
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
51
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
72
52
  evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
73
- evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
53
+ evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
74
54
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
75
55
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
76
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
77
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
78
- evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
79
- evalscope/benchmarks/benchmark.py,sha256=DnLgr__CzE4DICK3u3ZMeFY0sVktefmYh2Yql2swEhg,1796
80
- evalscope/benchmarks/data_adapter.py,sha256=hSW-tyTXxUPS_FnsMYAxxw9e4N7jS5eLiBHgCFAQNeo,10287
81
- evalscope/benchmarks/arc/__init__.py,sha256=9GBWGArac-s9igD8lnoEEKnpSQYNaHA8fVKonLimkrQ,360
58
+ evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
+ evalscope/benchmarks/benchmark.py,sha256=RuQEH5cQv4I9B1XxBZ0vAKTAfYZSUS9eK0o0RrMFVMA,2407
60
+ evalscope/benchmarks/data_adapter.py,sha256=-5Z_fdTRmkcXf1wnRuHgPrGVMKIl8Sq8RBTF9_HYo9A,12146
61
+ evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
82
62
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
83
- evalscope/benchmarks/arc/arc_adapter.py,sha256=3q74hZB9G3X0-pQPzBk_a8wZIedmIlDHZBb4aUaBGRA,9197
84
- evalscope/benchmarks/bbh/__init__.py,sha256=PcIMfTe4h5m-efBhnYQt6J-6O0qHFHGfuosRhk1Lhfo,303
85
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=UeNEEea5jqT7sYLpGGzvnxDdy6SrffM8H7gnVRpfGTw,10699
63
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=8xw01LNkx19J4BNN-D2SbzcA6GA_9nAVMH7WNPzBWXs,6661
64
+ evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=vpFy-05ubDwJ1IIsIV802_fWicgPJvq3uXtIneVhr48,8293
86
66
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
87
67
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
88
68
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -110,84 +90,91 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
110
90
  evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
111
91
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
112
92
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
113
- evalscope/benchmarks/ceval/__init__.py,sha256=vBN_OgmcvKglYIu96nRoT2wD8FDdM3cRoTB-dqlmbLg,393
114
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1J_WquXRPw-pRHBiYn7ZxRVSjjvWDqRUJLa8nvT1vYk,15050
93
+ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
94
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=-qrzeXWC3dmF-mpJV-Gtz5PDIzCbWaLGdi5x1ha1ZC4,14347
115
95
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
116
96
  evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
117
- evalscope/benchmarks/cmmlu/__init__.py,sha256=9M_Lo5-ePaD6hWG-Y-_i-U79yTOKadtHPG7zFvekwN4,393
97
+ evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
118
98
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
119
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=malBAKNtDbfJ-kJoQUQTYYQ18MTJST63bgcsLiiktlw,13956
99
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=G1EnVVeYhycQ58a8PiXfYb3Pe4iEuf8ngHNJ4CUJz14,13311
120
100
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
121
- evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0rCSiHe-yTY0nwM6-l75nI,465
101
+ evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
122
102
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
123
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
124
- evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
125
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=cSW0Mq9__-gh-tVoVXD9Rk6h3h2iZW-Fu3RQ16haJhQ,5878
126
- evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
103
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=xAH3_EiJNhHO1iGTNC7CqTVOF-tpr-9o6Hj_DF5-gNg,6766
104
+ evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=F33qTr2LksJOkkR8VqFM4dwM1CKHSsdWfNrZ7w09z2Y,5650
106
+ evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
127
107
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
128
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=KBI9t5F7XW1Cs44QUA7ultkfsXxLyucH9zNYe-jOQQk,13866
129
- evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
108
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=Qo-4fKHMFzSH5TEkc8NbciKOfP9ESY8CcGRV7dgjh7k,11212
109
+ evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
130
110
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
131
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=IIesSMPw1Yya4-LjqJt1QVkpOx8RGKwBYTQtmc0VfaQ,8495
132
- evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
111
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=Ea_LTREFtroil7D6EGxPT9-QxVGdot5ZhfixUqjuYqo,6046
112
+ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
133
113
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
134
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=VAO7siedusq9z3b1J3ztFE4XDopYKqmwe2n-Numg7HY,9149
135
- evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
114
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=iGxgOMVJTDAmJMmSzCmErLOwTMpPd11afoF5YgtvMJs,5224
115
+ evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
136
116
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
137
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
117
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8hfAcTXN4c6I45GA8IhU1bJmQMTGJBXoEyaZEuR-ays,14761
138
118
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
139
- evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
119
+ evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=CYDfZTqn6qVwTE66PUpSt-RRqZHwXNZdykQr2QSECSY,4388
121
+ evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
140
122
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
141
- evalscope/benchmarks/race/race_adapter.py,sha256=WgnWYSctc3VtWm2FAeVDTlxR2hwXsF2tala7n66f5mw,9841
123
+ evalscope/benchmarks/race/race_adapter.py,sha256=1tLSb9nCvqCQ_6JjwiknFPD-L1E5pgvOBwZ-11G0JMU,9220
142
124
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
143
- evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
125
+ evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
144
126
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
145
127
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
146
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=YFatCVNM7I0YUttBznQMohmMkm3qxJpCSVxf6o_sgHk,7663
147
- evalscope/benchmarks/truthful_qa/__init__.py,sha256=EZOaHn13NS3ddHpS62ija8jz71SxOOsqcQRVg69e_Ho,429
128
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=pS8-uqNBqRIxTER8oVrLvu8kGJ9L3pvNCqCHZHiCPAc,5191
129
+ evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
148
130
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
149
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=XFnZXQJpHEe_xP_HImPHa8qrwojywnWAgeSaJAYB0oU,14916
131
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=UpzhcW7yCMv4GDzDKqL_y0KxeDkvbupuzoRh5qCsiys,14623
150
132
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
151
133
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
152
134
  evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
153
135
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
154
136
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
155
137
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
138
+ evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
139
+ evalscope/collections/evaluator.py,sha256=6bF7TtgHMWOSpuBzpuu9A40y9dNTxdI8vizC5-3LRhI,7404
140
+ evalscope/collections/sampler.py,sha256=psvciGq9lE_-EnJxR3l06SM7NC9XmDnRdu1ckH79kXI,4526
141
+ evalscope/collections/schema.py,sha256=Eq64Hr8GebsBsO_THixfrIWCioVCpr3LXsGXMaehui0,4055
156
142
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
157
- evalscope/evaluator/evaluator.py,sha256=nRR6aaa9J8nRfB8QPZwexSrfKDvPkPSGQpFVpbWLeW0,18380
143
+ evalscope/evaluator/evaluator.py,sha256=S3VWI6kFX4cJdsI1Px0-P1y4wmC_PoOqXMFeM3v-C74,16310
158
144
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
159
145
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
160
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
161
- evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
146
+ evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
147
+ evalscope/metrics/__init__.py,sha256=CnhvODaILc4X0dnBoSPuSbTE2WbSf5NEEzM2M9a6uII,434
162
148
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
163
- evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
164
- evalscope/metrics/metrics.py,sha256=9Qj2KuSmaLOPhpGdBfiKGKVTIxHCuk0CPKI2b6L1zb8,12589
165
- evalscope/metrics/rouge_metric.py,sha256=oB-rBgMnavZSyOiAefg--OXdGfffKrET5bUmrx3nmx0,4408
149
+ evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
150
+ evalscope/metrics/metrics.py,sha256=XutNgiBAWACPZEIBSzylugDGFV4fDvo-qIYkxG7w2Mc,12634
151
+ evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
166
152
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
167
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=9YdE38duhBFsmFLkY7HXDCQqUNavB5Hh3kaB4WTjAII,11971
153
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
168
154
  evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
169
155
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
170
- evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
171
- evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
172
- evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
173
- evalscope/models/model_adapter.py,sha256=XBeSFTR9pXmnhFWRRddcobnITC5T4JKooeFUeWEtUVI,19006
174
- evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
175
- evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
176
- evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
156
+ evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
157
+ evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
158
+ evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
159
+ evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
160
+ evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
161
+ evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
162
+ evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
163
+ evalscope/models/server_adapter.py,sha256=InS4M_LprbBV4xHcbPCm5y_S8-kApKDYhR-HEKXzG8Q,4169
177
164
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
178
165
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
179
166
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
180
- evalscope/perf/arguments.py,sha256=J067vNJF-RObJNZ0oE2RBIBNjliCYcflWtt6aGAt40g,9205
181
- evalscope/perf/benchmark.py,sha256=h151QXsVbg7lMe09aH_mxUdPRALIl1A35I9VO2zryEo,9615
167
+ evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
168
+ evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
182
169
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
183
- evalscope/perf/main.py,sha256=2GrE9wHibprzaw4gmcovdc5ods_EHwoSwwmkFDLTUjQ,1257
170
+ evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
184
171
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
185
172
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
186
173
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
187
174
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
188
175
  evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
189
176
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
190
- evalscope/perf/plugin/api/openai_api.py,sha256=raa4SaatEphNfWuK6_3ecfe49Vg4yftD6C-enhufJuE,7020
177
+ evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
191
178
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
192
179
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
193
180
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -201,7 +188,7 @@ evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0
201
188
  evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
202
189
  evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
203
190
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
204
- evalscope/perf/utils/local_server.py,sha256=A26gqBbxsnZA8CqQospyO50x3prVnD9XiT2l--ERxK0,4566
191
+ evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
205
192
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
206
193
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
207
194
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -239,6 +226,7 @@ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=
239
226
  evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
240
227
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
241
228
  evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
229
+ evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
242
230
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
243
231
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
244
232
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -253,23 +241,24 @@ evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,5
253
241
  evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
254
242
  evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
255
243
  evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
256
- evalscope/utils/__init__.py,sha256=ZOri8VHx8LpJBJS90uw8h0Z7gPhtxhjWlBPWuuZgoRE,121
244
+ evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
257
245
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
258
- evalscope/utils/chat_service.py,sha256=VdNPXdFSf-4zxe0Ht74LBcdRNbpb9vzVi86HDEqfXHc,8647
246
+ evalscope/utils/chat_service.py,sha256=h6Z9CpgdmalD9u2WNxdfJw2MdzDqsMfDHmnNk8GkffY,8666
259
247
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
260
- evalscope/utils/io_utils.py,sha256=MnEi4llOYtXK81bUQ_XE_WP5qIsVrJ4MlKmWMH9vzFs,3993
261
- evalscope/utils/logger.py,sha256=4OGlkBsut_wzq-1UcM2DKQKdKs1FRNYGHw538TGvypU,3440
262
- evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
263
- evalscope/utils/utils.py,sha256=lZl5lt4WqjoY5SEfsum8Sc-s_c9GSlmIZlkTAQkMnjE,10485
248
+ evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
249
+ evalscope/utils/logger.py,sha256=Cke17sVV9MrccINeuEsiVouJarDvS4Wt2JUaWK5NFLM,3582
250
+ evalscope/utils/model_utils.py,sha256=PqIu1nMhoD7sauZATkuxkPo4lrYTQRh8kleERrWD-Po,678
251
+ evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
264
252
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
265
253
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
266
254
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
267
- tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
255
+ tests/cli/test_collection.py,sha256=pS-omRGU6yuvk5O5RPRIOklVKWKsV3lvPNvmk7rVIMY,2825
256
+ tests/cli/test_run.py,sha256=V5lxiqtuNcpbjewPaE3KD8ssuIolvhhIzYEU7iDXlZE,5492
268
257
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
269
- tests/perf/test_perf.py,sha256=AQB2QuMwJ1TnenHFPBF4YAtifbR0D0pSobP6xmDysqw,3023
258
+ tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
270
259
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
260
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
272
- tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
261
+ tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
273
262
  tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
274
263
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
275
264
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
@@ -277,9 +266,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
277
266
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
278
267
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
279
268
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
280
- evalscope-0.8.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
281
- evalscope-0.8.1.dist-info/METADATA,sha256=HydrEYb1OxbvVUMl11oLekV2sjvlgQQvtEpkcNAiW5A,23190
282
- evalscope-0.8.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
283
- evalscope-0.8.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
284
- evalscope-0.8.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
285
- evalscope-0.8.1.dist-info/RECORD,,
269
+ evalscope-0.9.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
270
+ evalscope-0.9.0.dist-info/METADATA,sha256=KbU5bo3jjt1FsaTVXvdRqJJQEgge_431xW3uQHYKawI,25136
271
+ evalscope-0.9.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
272
+ evalscope-0.9.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
273
+ evalscope-0.9.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
274
+ evalscope-0.9.0.dist-info/RECORD,,
@@ -0,0 +1,53 @@
1
+ import json
2
+ import unittest
3
+
4
+ from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
5
+ from evalscope.constants import EvalType
6
+ from evalscope.utils.io_utils import dump_jsonl_data
7
+ from evalscope.utils.utils import test_level_list
8
+
9
+
10
+ class TestCollection(unittest.TestCase):
11
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
12
+ def test_create_collection(self):
13
+ schema = CollectionSchema(name='math&reasoning', datasets=[
14
+ CollectionSchema(name='math', datasets=[
15
+ DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
16
+ DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
17
+ DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
18
+ DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
19
+ ]),
20
+ CollectionSchema(name='reasoning', datasets=[
21
+ DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
22
+ DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
23
+ DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
24
+ ]),
25
+ ])
26
+ print(schema.to_dict())
27
+ print(schema.flatten())
28
+ schema.dump_json('outputs/schema_test.json')
29
+
30
+
31
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
32
+ def test_generate_data(self):
33
+ schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
34
+ print(schema.to_dict())
35
+ mixed_data = WeightedSampler(schema, 100).sample()
36
+ dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
37
+
38
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
39
+ def test_evaluate_collection(self):
40
+ from evalscope import TaskConfig, run_task
41
+
42
+ task_cfg = TaskConfig(
43
+ model='qwen2.5',
44
+ api_url='http://127.0.0.1:8801/v1/chat/completions',
45
+ api_key='EMPTY',
46
+ eval_type=EvalType.SERVICE,
47
+ datasets=['data_collection'],
48
+ dataset_args={'data_collection': {
49
+ # 'local_path': 'outputs/mixed_data_test.jsonl'
50
+ 'local_path': 'outputs/weighted_mixed_data.jsonl'
51
+ }},
52
+ )
53
+ run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -4,6 +4,7 @@ import subprocess
4
4
  import torch
5
5
  import unittest
6
6
 
7
+ from evalscope.constants import EvalType
7
8
  from evalscope.run import run_task
8
9
  from evalscope.utils import is_module_installed, test_level_list
9
10
  from evalscope.utils.logger import get_logger
@@ -70,7 +71,19 @@ class TestRun(unittest.TestCase):
70
71
 
71
72
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
72
73
  def test_run_task(self):
73
- task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False}
74
+ task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
75
+ 'datasets': [
76
+ 'mmlu_pro',
77
+ # 'bbh',
78
+ 'hellaswag',
79
+ # 'gsm8k',
80
+ # 'arc'
81
+ # 'race',
82
+ # 'truthful_qa',
83
+ # 'trivia_qa',
84
+ ],
85
+ 'limit': 20,
86
+ 'debug': True}
74
87
  run_task(task_cfg=task_cfg)
75
88
 
76
89
 
@@ -110,5 +123,34 @@ class TestRun(unittest.TestCase):
110
123
 
111
124
  run_task(task_cfg=task_cfg)
112
125
 
126
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
127
+ def test_run_server_model(self):
128
+ from evalscope.config import TaskConfig
129
+
130
+ task_cfg = TaskConfig(
131
+ model='qwen2.5',
132
+ api_url='http://127.0.0.1:8801/v1/chat/completions',
133
+ api_key='EMPTY',
134
+ eval_type=EvalType.SERVICE,
135
+ datasets=[
136
+ 'mmlu_pro',
137
+ # 'race',
138
+ # 'trivia_qa',
139
+ # 'cmmlu',
140
+ # 'humaneval',
141
+ # 'competition_math',
142
+ # 'gsm8k',
143
+ # 'arc',
144
+ # 'ceval',
145
+ # 'bbh',
146
+ # 'hellaswag',
147
+ ],
148
+ limit=2,
149
+ debug=True
150
+ )
151
+
152
+ run_task(task_cfg=task_cfg)
153
+
154
+
113
155
  if __name__ == '__main__':
114
156
  unittest.main()
tests/perf/test_perf.py CHANGED
@@ -19,13 +19,13 @@ class TestPerf(unittest.TestCase):
19
19
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
20
20
  def test_run_perf(self):
21
21
  task_cfg = {
22
- 'url': 'http://127.0.0.1:8000/v1/chat/completions',
22
+ 'url': 'http://127.0.0.1:8001/v1/chat/completions',
23
23
  'parallel': 1,
24
24
  'model': 'qwen2.5',
25
25
  'number': 15,
26
26
  'api': 'openai',
27
27
  'dataset': 'openqa',
28
- 'stream': True,
28
+ # 'stream': True,
29
29
  'debug': True,
30
30
  }
31
31
  run_perf_benchmark(task_cfg)
@@ -47,7 +47,7 @@ class TestPerf(unittest.TestCase):
47
47
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
48
48
  def test_run_perf_speed_benchmark(self):
49
49
  task_cfg = {
50
- 'url': 'http://127.0.0.1:8801/v1/completions',
50
+ 'url': 'http://127.0.0.1:8001/v1/completions',
51
51
  'parallel': 1,
52
52
  'model': 'qwen2.5',
53
53
  'api': 'openai',
tests/rag/test_mteb.py CHANGED
@@ -79,7 +79,7 @@ class TestMTEB(unittest.TestCase):
79
79
  },
80
80
  },
81
81
  {
82
- 'model_name_or_path': 'OpenBMB/MiniCPM-Reranker',
82
+ 'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
83
83
  'is_cross_encoder': True,
84
84
  'max_seq_length': 512,
85
85
  'prompt': '为这个问题生成一个检索用的表示',
@@ -94,7 +94,8 @@ class TestMTEB(unittest.TestCase):
94
94
  'verbosity': 2,
95
95
  'output_folder': 'outputs',
96
96
  'overwrite_results': True,
97
- 'limits': 10,
97
+ # 'limits': 10,
98
+ 'top_k': 10,
98
99
  },
99
100
  },
100
101
  }
@@ -1,87 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -492257975294377194,
4
- "language": "chinese",
5
- "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "是什么为太阳提供能量,它的主要功能是什么?",
10
- "answer": [
11
- "太阳的能量来自核裂变,类似于地球上的核反应堆。",
12
- "太阳的主要功能是为太阳系提供光。"
13
- ],
14
- "ground_truth": [
15
- "太阳的能量来自核聚变,其中氢原子融合形成氦。",
16
- "太阳核心的这种聚变过程释放出巨大的能量。",
17
- "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
18
- "太阳的光在地球的气候系统中起着关键作用。",
19
- "阳光有助于驱动天气和海洋洋流。"
20
- ]
21
- },
22
- "output": {
23
- "TP": [
24
- {
25
- "statement": "太阳的主要功能是为太阳系提供光。",
26
- "reason": "这一说法在某种程度上得到了地面事实的支持,提到太阳提供光和它的作用,尽管它更广泛地关注太阳的能量。"
27
- }
28
- ],
29
- "FP": [
30
- {
31
- "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
32
- "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
33
- }
34
- ],
35
- "FN": [
36
- {
37
- "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
38
- "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
39
- },
40
- {
41
- "statement": "太阳核心的这种聚变过程释放出巨大的能量。",
42
- "reason": "这个过程及其重要性没有在答案中提到。"
43
- },
44
- {
45
- "statement": "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
46
- "reason": "答案中只提到了光,忽略了热量及其对生命的必要性,这些在地面事实中都有涵盖。"
47
- },
48
- {
49
- "statement": "太阳的光在地球的气候系统中起着关键作用。",
50
- "reason": "太阳光对地球气候系统的这种更广泛的影响没有在答案中提到。"
51
- },
52
- {
53
- "statement": "阳光有助于驱动天气和海洋洋流。",
54
- "reason": "答案中省略了阳光对天气模式和海洋洋流的影响。"
55
- }
56
- ]
57
- }
58
- },
59
- {
60
- "input": {
61
- "question": "水的沸点是多少?",
62
- "answer": [
63
- "水的沸点在海平面上是100摄氏度。"
64
- ],
65
- "ground_truth": [
66
- "水的沸点在海平面上是100摄氏度(212华氏度)。",
67
- "水的沸点会随着海拔的变化而变化。"
68
- ]
69
- },
70
- "output": {
71
- "TP": [
72
- {
73
- "statement": "水的沸点在海平面上是100摄氏度。",
74
- "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
75
- }
76
- ],
77
- "FP": [],
78
- "FN": [
79
- {
80
- "statement": "水的沸点会随着海拔的变化而变化。",
81
- "reason": "关于水的沸点如何随海拔变化的额外信息没有在答案中提到。"
82
- }
83
- ]
84
- }
85
- }
86
- ]
87
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -8546983388246528139,
4
- "language": "chinese",
5
- "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "阿尔伯特·爱因斯坦是谁,他以什么而闻名?",
10
- "answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。",
11
- "sentences": {
12
- "0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
13
- "1": "他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。"
14
- }
15
- },
16
- "output": {
17
- "sentences": [
18
- {
19
- "sentence_index": 0,
20
- "simpler_statements": [
21
- "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
22
- "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
23
- ]
24
- },
25
- {
26
- "sentence_index": 1,
27
- "simpler_statements": [
28
- "阿尔伯特·爱因斯坦最著名的是发展了相对论。",
29
- "阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
30
- ]
31
- }
32
- ]
33
- }
34
- }
35
- ]
36
- }
@@ -1,26 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 7951911230338252816,
4
- "language": "chinese",
5
- "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
6
- "examples": [
7
- {
8
- "input": {
9
- "response": "阿尔伯特·爱因斯坦出生在德国。"
10
- },
11
- "output": {
12
- "question": "阿尔伯特·爱因斯坦出生在哪里?",
13
- "noncommittal": 0
14
- }
15
- },
16
- {
17
- "input": {
18
- "response": "我不知道2023年发明的智能手机的突破性功能,因为我对2022年以后的信息不了解。"
19
- },
20
- "output": {
21
- "question": "2023年发明的智能手机的突破性功能是什么?",
22
- "noncommittal": 1
23
- }
24
- }
25
- ]
26
- }