evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
evalscope/__init__.py,sha256=
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
6
|
-
evalscope/run_arena.py,sha256=
|
|
1
|
+
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
+
evalscope/arguments.py,sha256=v0oKhnJ-2RUpEWWKC_-e7Km5osgPJeZC_aKw8R-3Y0A,4382
|
|
3
|
+
evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
|
|
4
|
+
evalscope/constants.py,sha256=SAa5IEjcDvcH_ePvCcbValAEyMvGnXPdO0jDmKk8uUs,3277
|
|
5
|
+
evalscope/run.py,sha256=cFUwfsXDTQ8NGJYe314LDF_hnuM60UUQxzgbOcPRDbY,5619
|
|
6
|
+
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=zr0PUDVLPIYwSv10FsTbYbOSBc6BNKFH3cDqhMMp1Jg,118
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y
|
|
|
22
22
|
evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
|
|
23
23
|
evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
|
|
25
|
-
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=
|
|
25
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=NwpxNECN7NFgtlVdKY7vet5m-gAmIp8MJYka0eexWu0,7424
|
|
26
26
|
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
|
|
27
27
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
|
|
28
28
|
evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
|
|
@@ -42,26 +42,6 @@ evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3H
|
|
|
42
42
|
evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
|
|
43
43
|
evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
|
|
44
44
|
evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
|
|
45
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
|
|
46
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
|
|
47
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
|
|
48
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
|
|
49
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
|
|
50
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
|
|
51
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
|
|
52
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
|
|
53
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
|
|
54
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
|
|
55
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
56
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
|
|
57
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
58
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
|
|
59
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
|
|
60
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
|
|
61
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
|
|
62
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
63
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
|
|
64
|
-
evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
|
|
65
45
|
evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
|
|
66
46
|
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
|
|
67
47
|
evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
|
|
@@ -70,19 +50,19 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
|
|
|
70
50
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
51
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
72
52
|
evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
|
|
73
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
53
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
|
|
74
54
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
75
55
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
76
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
77
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
78
|
-
evalscope/benchmarks/__init__.py,sha256=
|
|
79
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
80
|
-
evalscope/benchmarks/data_adapter.py,sha256
|
|
81
|
-
evalscope/benchmarks/arc/__init__.py,sha256=
|
|
58
|
+
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=RuQEH5cQv4I9B1XxBZ0vAKTAfYZSUS9eK0o0RrMFVMA,2407
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=-5Z_fdTRmkcXf1wnRuHgPrGVMKIl8Sq8RBTF9_HYo9A,12146
|
|
61
|
+
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
82
62
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
83
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
84
|
-
evalscope/benchmarks/bbh/__init__.py,sha256=
|
|
85
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
63
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=8xw01LNkx19J4BNN-D2SbzcA6GA_9nAVMH7WNPzBWXs,6661
|
|
64
|
+
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=vpFy-05ubDwJ1IIsIV802_fWicgPJvq3uXtIneVhr48,8293
|
|
86
66
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
87
67
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
88
68
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -110,84 +90,91 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
|
|
|
110
90
|
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
111
91
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
112
92
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
113
|
-
evalscope/benchmarks/ceval/__init__.py,sha256=
|
|
114
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256
|
|
93
|
+
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
94
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=-qrzeXWC3dmF-mpJV-Gtz5PDIzCbWaLGdi5x1ha1ZC4,14347
|
|
115
95
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
116
96
|
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
117
|
-
evalscope/benchmarks/cmmlu/__init__.py,sha256=
|
|
97
|
+
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
118
98
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
119
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
99
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=G1EnVVeYhycQ58a8PiXfYb3Pe4iEuf8ngHNJ4CUJz14,13311
|
|
120
100
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
121
|
-
evalscope/benchmarks/competition_math/__init__.py,sha256=
|
|
101
|
+
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
122
102
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
123
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
124
|
-
evalscope/benchmarks/general_qa/__init__.py,sha256=
|
|
125
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
126
|
-
evalscope/benchmarks/gsm8k/__init__.py,sha256=
|
|
103
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=xAH3_EiJNhHO1iGTNC7CqTVOF-tpr-9o6Hj_DF5-gNg,6766
|
|
104
|
+
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=F33qTr2LksJOkkR8VqFM4dwM1CKHSsdWfNrZ7w09z2Y,5650
|
|
106
|
+
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
127
107
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
128
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
129
|
-
evalscope/benchmarks/hellaswag/__init__.py,sha256=
|
|
108
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=Qo-4fKHMFzSH5TEkc8NbciKOfP9ESY8CcGRV7dgjh7k,11212
|
|
109
|
+
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
130
110
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
131
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
132
|
-
evalscope/benchmarks/humaneval/__init__.py,sha256=
|
|
111
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=Ea_LTREFtroil7D6EGxPT9-QxVGdot5ZhfixUqjuYqo,6046
|
|
112
|
+
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
133
113
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
134
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
135
|
-
evalscope/benchmarks/mmlu/__init__.py,sha256=
|
|
114
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=iGxgOMVJTDAmJMmSzCmErLOwTMpPd11afoF5YgtvMJs,5224
|
|
115
|
+
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
136
116
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
137
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
117
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8hfAcTXN4c6I45GA8IhU1bJmQMTGJBXoEyaZEuR-ays,14761
|
|
138
118
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
139
|
-
evalscope/benchmarks/
|
|
119
|
+
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
120
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=CYDfZTqn6qVwTE66PUpSt-RRqZHwXNZdykQr2QSECSY,4388
|
|
121
|
+
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
140
122
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
141
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
123
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=1tLSb9nCvqCQ_6JjwiknFPD-L1E5pgvOBwZ-11G0JMU,9220
|
|
142
124
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
143
|
-
evalscope/benchmarks/trivia_qa/__init__.py,sha256=
|
|
125
|
+
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
144
126
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
145
127
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
146
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
147
|
-
evalscope/benchmarks/truthful_qa/__init__.py,sha256=
|
|
128
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=pS8-uqNBqRIxTER8oVrLvu8kGJ9L3pvNCqCHZHiCPAc,5191
|
|
129
|
+
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
148
130
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
149
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
131
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=UpzhcW7yCMv4GDzDKqL_y0KxeDkvbupuzoRh5qCsiys,14623
|
|
150
132
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
151
133
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
152
134
|
evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
|
|
153
135
|
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
154
136
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
155
137
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
138
|
+
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
139
|
+
evalscope/collections/evaluator.py,sha256=6bF7TtgHMWOSpuBzpuu9A40y9dNTxdI8vizC5-3LRhI,7404
|
|
140
|
+
evalscope/collections/sampler.py,sha256=psvciGq9lE_-EnJxR3l06SM7NC9XmDnRdu1ckH79kXI,4526
|
|
141
|
+
evalscope/collections/schema.py,sha256=Eq64Hr8GebsBsO_THixfrIWCioVCpr3LXsGXMaehui0,4055
|
|
156
142
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
157
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
143
|
+
evalscope/evaluator/evaluator.py,sha256=S3VWI6kFX4cJdsI1Px0-P1y4wmC_PoOqXMFeM3v-C74,16310
|
|
158
144
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
159
145
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
160
|
-
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=
|
|
161
|
-
evalscope/metrics/__init__.py,sha256=
|
|
146
|
+
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
147
|
+
evalscope/metrics/__init__.py,sha256=CnhvODaILc4X0dnBoSPuSbTE2WbSf5NEEzM2M9a6uII,434
|
|
162
148
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
163
|
-
evalscope/metrics/math_accuracy.py,sha256=
|
|
164
|
-
evalscope/metrics/metrics.py,sha256=
|
|
165
|
-
evalscope/metrics/rouge_metric.py,sha256=
|
|
149
|
+
evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
|
|
150
|
+
evalscope/metrics/metrics.py,sha256=XutNgiBAWACPZEIBSzylugDGFV4fDvo-qIYkxG7w2Mc,12634
|
|
151
|
+
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
166
152
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
167
|
-
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
|
|
153
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
168
154
|
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
169
155
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
170
|
-
evalscope/models/__init__.py,sha256=
|
|
171
|
-
evalscope/models/
|
|
172
|
-
evalscope/models/
|
|
173
|
-
evalscope/models/
|
|
174
|
-
evalscope/models/
|
|
175
|
-
evalscope/models/
|
|
176
|
-
evalscope/models/
|
|
156
|
+
evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
|
|
157
|
+
evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
|
|
158
|
+
evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
|
|
159
|
+
evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
|
|
160
|
+
evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
|
|
161
|
+
evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
|
|
162
|
+
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
163
|
+
evalscope/models/server_adapter.py,sha256=InS4M_LprbBV4xHcbPCm5y_S8-kApKDYhR-HEKXzG8Q,4169
|
|
177
164
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
178
165
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
179
166
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
180
|
-
evalscope/perf/arguments.py,sha256=
|
|
181
|
-
evalscope/perf/benchmark.py,sha256=
|
|
167
|
+
evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
|
|
168
|
+
evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
|
|
182
169
|
evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
|
|
183
|
-
evalscope/perf/main.py,sha256=
|
|
170
|
+
evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
|
|
184
171
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
185
172
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
186
173
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
187
174
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
188
175
|
evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
|
|
189
176
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
190
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
177
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
|
|
191
178
|
evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
|
|
192
179
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
193
180
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
@@ -201,7 +188,7 @@ evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0
|
|
|
201
188
|
evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
|
|
202
189
|
evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
|
|
203
190
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
204
|
-
evalscope/perf/utils/local_server.py,sha256=
|
|
191
|
+
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
205
192
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
206
193
|
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
207
194
|
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
@@ -239,6 +226,7 @@ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=
|
|
|
239
226
|
evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
|
|
240
227
|
evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
241
228
|
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
|
|
229
|
+
evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
242
230
|
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
243
231
|
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
244
232
|
evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
|
|
@@ -253,23 +241,24 @@ evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,5
|
|
|
253
241
|
evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
|
|
254
242
|
evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
|
|
255
243
|
evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
256
|
-
evalscope/utils/__init__.py,sha256=
|
|
244
|
+
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
257
245
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
258
|
-
evalscope/utils/chat_service.py,sha256=
|
|
246
|
+
evalscope/utils/chat_service.py,sha256=h6Z9CpgdmalD9u2WNxdfJw2MdzDqsMfDHmnNk8GkffY,8666
|
|
259
247
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
260
|
-
evalscope/utils/io_utils.py,sha256=
|
|
261
|
-
evalscope/utils/logger.py,sha256=
|
|
262
|
-
evalscope/utils/model_utils.py,sha256=
|
|
263
|
-
evalscope/utils/utils.py,sha256=
|
|
248
|
+
evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
|
|
249
|
+
evalscope/utils/logger.py,sha256=Cke17sVV9MrccINeuEsiVouJarDvS4Wt2JUaWK5NFLM,3582
|
|
250
|
+
evalscope/utils/model_utils.py,sha256=PqIu1nMhoD7sauZATkuxkPo4lrYTQRh8kleERrWD-Po,678
|
|
251
|
+
evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
|
|
264
252
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
265
253
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
266
254
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
267
|
-
tests/cli/
|
|
255
|
+
tests/cli/test_collection.py,sha256=pS-omRGU6yuvk5O5RPRIOklVKWKsV3lvPNvmk7rVIMY,2825
|
|
256
|
+
tests/cli/test_run.py,sha256=V5lxiqtuNcpbjewPaE3KD8ssuIolvhhIzYEU7iDXlZE,5492
|
|
268
257
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
269
|
-
tests/perf/test_perf.py,sha256=
|
|
258
|
+
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
270
259
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
271
260
|
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
272
|
-
tests/rag/test_mteb.py,sha256=
|
|
261
|
+
tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
|
|
273
262
|
tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
|
|
274
263
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
275
264
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
@@ -277,9 +266,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
277
266
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
278
267
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
279
268
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
280
|
-
evalscope-0.
|
|
281
|
-
evalscope-0.
|
|
282
|
-
evalscope-0.
|
|
283
|
-
evalscope-0.
|
|
284
|
-
evalscope-0.
|
|
285
|
-
evalscope-0.
|
|
269
|
+
evalscope-0.9.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
270
|
+
evalscope-0.9.0.dist-info/METADATA,sha256=KbU5bo3jjt1FsaTVXvdRqJJQEgge_431xW3uQHYKawI,25136
|
|
271
|
+
evalscope-0.9.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
272
|
+
evalscope-0.9.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
273
|
+
evalscope-0.9.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
274
|
+
evalscope-0.9.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import unittest
|
|
3
|
+
|
|
4
|
+
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
5
|
+
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.utils.io_utils import dump_jsonl_data
|
|
7
|
+
from evalscope.utils.utils import test_level_list
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestCollection(unittest.TestCase):
|
|
11
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
12
|
+
def test_create_collection(self):
|
|
13
|
+
schema = CollectionSchema(name='math&reasoning', datasets=[
|
|
14
|
+
CollectionSchema(name='math', datasets=[
|
|
15
|
+
DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
|
|
16
|
+
DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
|
|
17
|
+
DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
|
|
18
|
+
DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
|
|
19
|
+
]),
|
|
20
|
+
CollectionSchema(name='reasoning', datasets=[
|
|
21
|
+
DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
22
|
+
DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
|
|
23
|
+
DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
|
|
24
|
+
]),
|
|
25
|
+
])
|
|
26
|
+
print(schema.to_dict())
|
|
27
|
+
print(schema.flatten())
|
|
28
|
+
schema.dump_json('outputs/schema_test.json')
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
32
|
+
def test_generate_data(self):
|
|
33
|
+
schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
|
|
34
|
+
print(schema.to_dict())
|
|
35
|
+
mixed_data = WeightedSampler(schema, 100).sample()
|
|
36
|
+
dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
|
|
37
|
+
|
|
38
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
39
|
+
def test_evaluate_collection(self):
|
|
40
|
+
from evalscope import TaskConfig, run_task
|
|
41
|
+
|
|
42
|
+
task_cfg = TaskConfig(
|
|
43
|
+
model='qwen2.5',
|
|
44
|
+
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
45
|
+
api_key='EMPTY',
|
|
46
|
+
eval_type=EvalType.SERVICE,
|
|
47
|
+
datasets=['data_collection'],
|
|
48
|
+
dataset_args={'data_collection': {
|
|
49
|
+
# 'local_path': 'outputs/mixed_data_test.jsonl'
|
|
50
|
+
'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
51
|
+
}},
|
|
52
|
+
)
|
|
53
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_run.py
CHANGED
|
@@ -4,6 +4,7 @@ import subprocess
|
|
|
4
4
|
import torch
|
|
5
5
|
import unittest
|
|
6
6
|
|
|
7
|
+
from evalscope.constants import EvalType
|
|
7
8
|
from evalscope.run import run_task
|
|
8
9
|
from evalscope.utils import is_module_installed, test_level_list
|
|
9
10
|
from evalscope.utils.logger import get_logger
|
|
@@ -70,7 +71,19 @@ class TestRun(unittest.TestCase):
|
|
|
70
71
|
|
|
71
72
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
72
73
|
def test_run_task(self):
|
|
73
|
-
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
74
|
+
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
75
|
+
'datasets': [
|
|
76
|
+
'mmlu_pro',
|
|
77
|
+
# 'bbh',
|
|
78
|
+
'hellaswag',
|
|
79
|
+
# 'gsm8k',
|
|
80
|
+
# 'arc'
|
|
81
|
+
# 'race',
|
|
82
|
+
# 'truthful_qa',
|
|
83
|
+
# 'trivia_qa',
|
|
84
|
+
],
|
|
85
|
+
'limit': 20,
|
|
86
|
+
'debug': True}
|
|
74
87
|
run_task(task_cfg=task_cfg)
|
|
75
88
|
|
|
76
89
|
|
|
@@ -110,5 +123,34 @@ class TestRun(unittest.TestCase):
|
|
|
110
123
|
|
|
111
124
|
run_task(task_cfg=task_cfg)
|
|
112
125
|
|
|
126
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
127
|
+
def test_run_server_model(self):
|
|
128
|
+
from evalscope.config import TaskConfig
|
|
129
|
+
|
|
130
|
+
task_cfg = TaskConfig(
|
|
131
|
+
model='qwen2.5',
|
|
132
|
+
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
133
|
+
api_key='EMPTY',
|
|
134
|
+
eval_type=EvalType.SERVICE,
|
|
135
|
+
datasets=[
|
|
136
|
+
'mmlu_pro',
|
|
137
|
+
# 'race',
|
|
138
|
+
# 'trivia_qa',
|
|
139
|
+
# 'cmmlu',
|
|
140
|
+
# 'humaneval',
|
|
141
|
+
# 'competition_math',
|
|
142
|
+
# 'gsm8k',
|
|
143
|
+
# 'arc',
|
|
144
|
+
# 'ceval',
|
|
145
|
+
# 'bbh',
|
|
146
|
+
# 'hellaswag',
|
|
147
|
+
],
|
|
148
|
+
limit=2,
|
|
149
|
+
debug=True
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
run_task(task_cfg=task_cfg)
|
|
153
|
+
|
|
154
|
+
|
|
113
155
|
if __name__ == '__main__':
|
|
114
156
|
unittest.main()
|
tests/perf/test_perf.py
CHANGED
|
@@ -19,13 +19,13 @@ class TestPerf(unittest.TestCase):
|
|
|
19
19
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
20
20
|
def test_run_perf(self):
|
|
21
21
|
task_cfg = {
|
|
22
|
-
'url': 'http://127.0.0.1:
|
|
22
|
+
'url': 'http://127.0.0.1:8001/v1/chat/completions',
|
|
23
23
|
'parallel': 1,
|
|
24
24
|
'model': 'qwen2.5',
|
|
25
25
|
'number': 15,
|
|
26
26
|
'api': 'openai',
|
|
27
27
|
'dataset': 'openqa',
|
|
28
|
-
'stream': True,
|
|
28
|
+
# 'stream': True,
|
|
29
29
|
'debug': True,
|
|
30
30
|
}
|
|
31
31
|
run_perf_benchmark(task_cfg)
|
|
@@ -47,7 +47,7 @@ class TestPerf(unittest.TestCase):
|
|
|
47
47
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
48
48
|
def test_run_perf_speed_benchmark(self):
|
|
49
49
|
task_cfg = {
|
|
50
|
-
'url': 'http://127.0.0.1:
|
|
50
|
+
'url': 'http://127.0.0.1:8001/v1/completions',
|
|
51
51
|
'parallel': 1,
|
|
52
52
|
'model': 'qwen2.5',
|
|
53
53
|
'api': 'openai',
|
tests/rag/test_mteb.py
CHANGED
|
@@ -79,7 +79,7 @@ class TestMTEB(unittest.TestCase):
|
|
|
79
79
|
},
|
|
80
80
|
},
|
|
81
81
|
{
|
|
82
|
-
'model_name_or_path': '
|
|
82
|
+
'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
|
|
83
83
|
'is_cross_encoder': True,
|
|
84
84
|
'max_seq_length': 512,
|
|
85
85
|
'prompt': '为这个问题生成一个检索用的表示',
|
|
@@ -94,7 +94,8 @@ class TestMTEB(unittest.TestCase):
|
|
|
94
94
|
'verbosity': 2,
|
|
95
95
|
'output_folder': 'outputs',
|
|
96
96
|
'overwrite_results': True,
|
|
97
|
-
'limits': 10,
|
|
97
|
+
# 'limits': 10,
|
|
98
|
+
'top_k': 10,
|
|
98
99
|
},
|
|
99
100
|
},
|
|
100
101
|
}
|
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"ragas_version": "0.2.7",
|
|
3
|
-
"original_hash": -492257975294377194,
|
|
4
|
-
"language": "chinese",
|
|
5
|
-
"instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
|
|
6
|
-
"examples": [
|
|
7
|
-
{
|
|
8
|
-
"input": {
|
|
9
|
-
"question": "是什么为太阳提供能量,它的主要功能是什么?",
|
|
10
|
-
"answer": [
|
|
11
|
-
"太阳的能量来自核裂变,类似于地球上的核反应堆。",
|
|
12
|
-
"太阳的主要功能是为太阳系提供光。"
|
|
13
|
-
],
|
|
14
|
-
"ground_truth": [
|
|
15
|
-
"太阳的能量来自核聚变,其中氢原子融合形成氦。",
|
|
16
|
-
"太阳核心的这种聚变过程释放出巨大的能量。",
|
|
17
|
-
"来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
|
|
18
|
-
"太阳的光在地球的气候系统中起着关键作用。",
|
|
19
|
-
"阳光有助于驱动天气和海洋洋流。"
|
|
20
|
-
]
|
|
21
|
-
},
|
|
22
|
-
"output": {
|
|
23
|
-
"TP": [
|
|
24
|
-
{
|
|
25
|
-
"statement": "太阳的主要功能是为太阳系提供光。",
|
|
26
|
-
"reason": "这一说法在某种程度上得到了地面事实的支持,提到太阳提供光和它的作用,尽管它更广泛地关注太阳的能量。"
|
|
27
|
-
}
|
|
28
|
-
],
|
|
29
|
-
"FP": [
|
|
30
|
-
{
|
|
31
|
-
"statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
|
|
32
|
-
"reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
|
|
33
|
-
}
|
|
34
|
-
],
|
|
35
|
-
"FN": [
|
|
36
|
-
{
|
|
37
|
-
"statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
|
|
38
|
-
"reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
|
|
39
|
-
},
|
|
40
|
-
{
|
|
41
|
-
"statement": "太阳核心的这种聚变过程释放出巨大的能量。",
|
|
42
|
-
"reason": "这个过程及其重要性没有在答案中提到。"
|
|
43
|
-
},
|
|
44
|
-
{
|
|
45
|
-
"statement": "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
|
|
46
|
-
"reason": "答案中只提到了光,忽略了热量及其对生命的必要性,这些在地面事实中都有涵盖。"
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
"statement": "太阳的光在地球的气候系统中起着关键作用。",
|
|
50
|
-
"reason": "太阳光对地球气候系统的这种更广泛的影响没有在答案中提到。"
|
|
51
|
-
},
|
|
52
|
-
{
|
|
53
|
-
"statement": "阳光有助于驱动天气和海洋洋流。",
|
|
54
|
-
"reason": "答案中省略了阳光对天气模式和海洋洋流的影响。"
|
|
55
|
-
}
|
|
56
|
-
]
|
|
57
|
-
}
|
|
58
|
-
},
|
|
59
|
-
{
|
|
60
|
-
"input": {
|
|
61
|
-
"question": "水的沸点是多少?",
|
|
62
|
-
"answer": [
|
|
63
|
-
"水的沸点在海平面上是100摄氏度。"
|
|
64
|
-
],
|
|
65
|
-
"ground_truth": [
|
|
66
|
-
"水的沸点在海平面上是100摄氏度(212华氏度)。",
|
|
67
|
-
"水的沸点会随着海拔的变化而变化。"
|
|
68
|
-
]
|
|
69
|
-
},
|
|
70
|
-
"output": {
|
|
71
|
-
"TP": [
|
|
72
|
-
{
|
|
73
|
-
"statement": "水的沸点在海平面上是100摄氏度。",
|
|
74
|
-
"reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
|
|
75
|
-
}
|
|
76
|
-
],
|
|
77
|
-
"FP": [],
|
|
78
|
-
"FN": [
|
|
79
|
-
{
|
|
80
|
-
"statement": "水的沸点会随着海拔的变化而变化。",
|
|
81
|
-
"reason": "关于水的沸点如何随海拔变化的额外信息没有在答案中提到。"
|
|
82
|
-
}
|
|
83
|
-
]
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
]
|
|
87
|
-
}
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"ragas_version": "0.2.7",
|
|
3
|
-
"original_hash": -8546983388246528139,
|
|
4
|
-
"language": "chinese",
|
|
5
|
-
"instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
|
|
6
|
-
"examples": [
|
|
7
|
-
{
|
|
8
|
-
"input": {
|
|
9
|
-
"question": "阿尔伯特·爱因斯坦是谁,他以什么而闻名?",
|
|
10
|
-
"answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。",
|
|
11
|
-
"sentences": {
|
|
12
|
-
"0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
|
|
13
|
-
"1": "他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。"
|
|
14
|
-
}
|
|
15
|
-
},
|
|
16
|
-
"output": {
|
|
17
|
-
"sentences": [
|
|
18
|
-
{
|
|
19
|
-
"sentence_index": 0,
|
|
20
|
-
"simpler_statements": [
|
|
21
|
-
"阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
|
|
22
|
-
"阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
|
|
23
|
-
]
|
|
24
|
-
},
|
|
25
|
-
{
|
|
26
|
-
"sentence_index": 1,
|
|
27
|
-
"simpler_statements": [
|
|
28
|
-
"阿尔伯特·爱因斯坦最著名的是发展了相对论。",
|
|
29
|
-
"阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
|
|
30
|
-
]
|
|
31
|
-
}
|
|
32
|
-
]
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
]
|
|
36
|
-
}
|
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"ragas_version": "0.2.7",
|
|
3
|
-
"original_hash": 7951911230338252816,
|
|
4
|
-
"language": "chinese",
|
|
5
|
-
"instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
|
|
6
|
-
"examples": [
|
|
7
|
-
{
|
|
8
|
-
"input": {
|
|
9
|
-
"response": "阿尔伯特·爱因斯坦出生在德国。"
|
|
10
|
-
},
|
|
11
|
-
"output": {
|
|
12
|
-
"question": "阿尔伯特·爱因斯坦出生在哪里?",
|
|
13
|
-
"noncommittal": 0
|
|
14
|
-
}
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
"input": {
|
|
18
|
-
"response": "我不知道2023年发明的智能手机的突破性功能,因为我对2022年以后的信息不了解。"
|
|
19
|
-
},
|
|
20
|
-
"output": {
|
|
21
|
-
"question": "2023年发明的智能手机的突破性功能是什么?",
|
|
22
|
-
"noncommittal": 1
|
|
23
|
-
}
|
|
24
|
-
}
|
|
25
|
-
]
|
|
26
|
-
}
|