evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
|
|
2
|
+
evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
|
|
3
|
+
evalscope/config.py,sha256=KYS_O0RdAbruQhqP6mp3rQL0003Oaskx03IroZUGRps,6897
|
|
4
|
+
evalscope/constants.py,sha256=D2MU7bs_qwmcHQ1ge05C5Ekk04XqMyiGxssvKwAecxI,4515
|
|
5
|
+
evalscope/run.py,sha256=5cG81qfdpMN_GtPphvJ7BHboD6LBYHWyodX8ViR1XL4,8874
|
|
6
|
+
evalscope/run_arena.py,sha256=Kmzak4TGdATbOhOCe_zLLRxDvgtkOfs6e4VaxOAzPKk,8550
|
|
7
|
+
evalscope/summarizer.py,sha256=Eq7ZqGKuvrhWVeGriLxHCGupgnJmtvmIGqZYzRNaY8I,6480
|
|
8
|
+
evalscope/version.py,sha256=Xha7v5_YH0Oppyh6iO7HrpSsmv1WCPdQPFtzYTJvG4A,118
|
|
9
|
+
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
evalscope/backend/base.py,sha256=l7zUHXX2XToIfU_hkVeTSHT9wWURYumyohXCIgywZBI,1021
|
|
11
|
+
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
12
|
+
evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
|
|
13
|
+
evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
|
|
14
|
+
evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
15
|
+
evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
|
|
16
|
+
evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
|
|
17
|
+
evalscope/backend/rag_eval/__init__.py,sha256=jFWj8l8bPAu1sz7wtX5gGIweBFC8c2LzXUPz7tGambE,284
|
|
18
|
+
evalscope/backend/rag_eval/backend_manager.py,sha256=Cw322R1j-L8vMERAWEXUTT-0a1K-V6KhQOtrOhgKVMM,2857
|
|
19
|
+
evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
|
|
20
|
+
evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
|
|
21
|
+
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
|
|
22
|
+
evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
|
|
23
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
|
|
25
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=L0WYiy3Rgar0uMZRI-kz1qCEuUaFXwcsVj0CACG13ms,7439
|
|
26
|
+
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
|
|
27
|
+
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
|
|
28
|
+
evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
|
|
29
|
+
evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
|
|
30
|
+
evalscope/backend/rag_eval/cmteb/arguments.py,sha256=Z3GkGi7zjK85JynG-7CSVPmAxPRcGYuykkgfbxgn7_E,2317
|
|
31
|
+
evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
|
|
32
|
+
evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
|
|
33
|
+
evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
|
|
34
|
+
evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-GTwORxILSkkXXGtTxuPTKSHNXQEllCRoUjuR7pnwFM,8962
|
|
35
|
+
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=pbZBfjeVAKbjLy4tEk6KUVDv-Rv8HNHYWuNkfqf-Vwk,2025
|
|
36
|
+
evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=yISp67pXw4fSrsqTiYmfas6uPyqwE45L1c58Tpydc0E,4075
|
|
37
|
+
evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=AH7jwJ45WAVxVb60I2DTURVanIAbrlZzk-ey_dHWEO0,5491
|
|
38
|
+
evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9GUpuEEmcWwc78Q7ZJjRDZs,11454
|
|
39
|
+
evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
|
|
40
|
+
evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
|
|
41
|
+
evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
|
|
42
|
+
evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
|
|
43
|
+
evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
|
|
44
|
+
evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
|
|
45
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
|
|
46
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
|
|
47
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
|
|
48
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
|
|
49
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
|
|
50
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
|
|
51
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
|
|
52
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
|
|
53
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
|
|
54
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
|
|
55
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
56
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
|
|
57
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
58
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
|
|
59
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
|
|
60
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
|
|
61
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
|
|
62
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
|
|
63
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
|
|
64
|
+
evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
|
|
65
|
+
evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
|
|
66
|
+
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
|
|
67
|
+
evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
|
|
68
|
+
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
|
|
69
|
+
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
|
|
70
|
+
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
+
evalscope/backend/rag_eval/utils/clip.py,sha256=WZovQJGyPI33Y-9bUnanR6fIYJzrXgnjD4zVwUJSgCw,5002
|
|
72
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=XWI07YeWDALc2etP4DGluYqrid85nKz1tjM91JLZRmM,6252
|
|
73
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
|
|
74
|
+
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
75
|
+
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
76
|
+
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
77
|
+
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
78
|
+
evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
|
|
79
|
+
evalscope/benchmarks/benchmark.py,sha256=DnLgr__CzE4DICK3u3ZMeFY0sVktefmYh2Yql2swEhg,1796
|
|
80
|
+
evalscope/benchmarks/data_adapter.py,sha256=hSW-tyTXxUPS_FnsMYAxxw9e4N7jS5eLiBHgCFAQNeo,10287
|
|
81
|
+
evalscope/benchmarks/arc/__init__.py,sha256=9GBWGArac-s9igD8lnoEEKnpSQYNaHA8fVKonLimkrQ,360
|
|
82
|
+
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
83
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=3q74hZB9G3X0-pQPzBk_a8wZIedmIlDHZBb4aUaBGRA,9197
|
|
84
|
+
evalscope/benchmarks/bbh/__init__.py,sha256=PcIMfTe4h5m-efBhnYQt6J-6O0qHFHGfuosRhk1Lhfo,303
|
|
85
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=UeNEEea5jqT7sYLpGGzvnxDdy6SrffM8H7gnVRpfGTw,10699
|
|
86
|
+
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
87
|
+
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
88
|
+
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
89
|
+
evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt,sha256=N28JGB5_023fBzoo5HImvjz3A0zPZGoiTMPngQY8pNo,3568
|
|
90
|
+
evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt,sha256=pkUw0ezfvNgbYzUKCchJ6o7bg6UCVL_62LWqS6lKZY0,2405
|
|
91
|
+
evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt,sha256=LDRYUp6fGvG6K2l69VvDF5Szf-CUtgPqobWaQ3MHJ7A,4477
|
|
92
|
+
evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt,sha256=H4LkGVmx3U4F6vuqRYXKDpHOBCu7MYksLFWp1QfyDPk,4831
|
|
93
|
+
evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt,sha256=H8BorN-CyUrf0vrIANSgEILynJhpS02CiGjn-qad9NQ,3114
|
|
94
|
+
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
|
|
95
|
+
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
|
|
96
|
+
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt,sha256=0e-abSxfoGJC8aYYtRCHlK_2UkiFwffSnv4iN7XXPLs,2505
|
|
97
|
+
evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt,sha256=Q4XZmrnTL1r8JCcB0mvJnb3oNUj45qjM-AfNK2ElWOQ,2121
|
|
98
|
+
evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt,sha256=YJ7chn5QrpNm8VHHqG2B7gYwBUt08fyT_qHhcc9KT-Y,2386
|
|
99
|
+
evalscope/benchmarks/bbh/cot_prompts/navigate.txt,sha256=n3Evl10cdk8VeMfZgUdu3knBH64LmLY5d4cQTnGMLuU,2147
|
|
100
|
+
evalscope/benchmarks/bbh/cot_prompts/object_counting.txt,sha256=SMQGqNi8JVCEVWcVVgQDedzKjslZSxHLcP68ECWX-Xc,1418
|
|
101
|
+
evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt,sha256=MlrdrvrBaUcW7VjWLLdN-O_yfwVFfYWHobGq099Cyhs,2386
|
|
102
|
+
evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt,sha256=jcL33cVyscRutNM793hWCryMBWQ-JFLip0DGM1UdAUc,2295
|
|
103
|
+
evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt,sha256=StrmTfLxHtvx4QM-zf1V2u8u1VQSxnZrI7Mwiizvjyw,3481
|
|
104
|
+
evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt,sha256=fJKB5sYhH0an4Oeqm53RDIu4mExZJVijCvGzje0nLz4,6141
|
|
105
|
+
evalscope/benchmarks/bbh/cot_prompts/snarks.txt,sha256=tvp4IAtaSNv8CKKeRx_G_PTVMICkenBNmMaq10SNXAE,3114
|
|
106
|
+
evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt,sha256=yRYmj1f0fwY8tiXTj_iiBYz5u4E4n7Sd3r0bJXHjSco,821
|
|
107
|
+
evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt,sha256=-qG7hItFjeahSB0EVvcikmLIR08P_fTIC-J38eV2fyk,3023
|
|
108
|
+
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
109
|
+
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
110
|
+
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
111
|
+
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
112
|
+
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
113
|
+
evalscope/benchmarks/ceval/__init__.py,sha256=vBN_OgmcvKglYIu96nRoT2wD8FDdM3cRoTB-dqlmbLg,393
|
|
114
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1J_WquXRPw-pRHBiYn7ZxRVSjjvWDqRUJLa8nvT1vYk,15050
|
|
115
|
+
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
116
|
+
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
117
|
+
evalscope/benchmarks/cmmlu/__init__.py,sha256=9M_Lo5-ePaD6hWG-Y-_i-U79yTOKadtHPG7zFvekwN4,393
|
|
118
|
+
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
119
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=malBAKNtDbfJ-kJoQUQTYYQ18MTJST63bgcsLiiktlw,13956
|
|
120
|
+
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
121
|
+
evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0rCSiHe-yTY0nwM6-l75nI,465
|
|
122
|
+
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
123
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
|
|
124
|
+
evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
|
|
125
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=Y7_d6hmh94W2XbzUnDMX9_uKWcarK0zv4Q4mQWUfSZ8,5869
|
|
126
|
+
evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
|
|
127
|
+
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
128
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=gg65W_pz4mPOBUOwaYIgfUxGKzrmRZRuoEg5xtS8bYg,13830
|
|
129
|
+
evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
|
|
130
|
+
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
131
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=7REJeC8vD8OVtmcqI5TP6cTn88-KOzBs5oOKEZEmESs,8459
|
|
132
|
+
evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
|
|
133
|
+
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
134
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=HxAjkIA-Wt5-wb8kNSDMzZRoHflgsNxIfa1BoeVzwog,1660
|
|
135
|
+
evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
|
|
136
|
+
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
137
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
|
|
138
|
+
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
139
|
+
evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
|
|
140
|
+
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
141
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=Ppo7bttx15zB-m-UtguIwIXgqpEKAi_ClIOol0hPQiE,9805
|
|
142
|
+
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
143
|
+
evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
|
|
144
|
+
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
145
|
+
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
146
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=YFatCVNM7I0YUttBznQMohmMkm3qxJpCSVxf6o_sgHk,7663
|
|
147
|
+
evalscope/benchmarks/truthful_qa/__init__.py,sha256=EZOaHn13NS3ddHpS62ija8jz71SxOOsqcQRVg69e_Ho,429
|
|
148
|
+
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
149
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=XFnZXQJpHEe_xP_HImPHa8qrwojywnWAgeSaJAYB0oU,14916
|
|
150
|
+
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
151
|
+
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
152
|
+
evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
|
|
153
|
+
evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
|
|
154
|
+
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
155
|
+
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
156
|
+
evalscope/evaluator/__init__.py,sha256=h_EyZm7vDqBsGx6CkoQVLg0aMy0tE_IG5uEnheubb0s,174
|
|
157
|
+
evalscope/evaluator/evaluator.py,sha256=MGkuJi9o5Hdbj_fN7qolDqP0B47i9i0ksGd1uc-TMn0,18365
|
|
158
|
+
evalscope/evaluator/humaneval_evaluator.py,sha256=245XRxwulGQpjdapwU8CiYJn1xT0XKxl7hdWvzFxLG0,5964
|
|
159
|
+
evalscope/evaluator/rating_eval.py,sha256=VuDIZcmSlsv1tc8znDGesz8ZwpQ7NvZJPv823Quvht0,5566
|
|
160
|
+
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
161
|
+
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=YVTJAHK0uz9hNupsdeTXMM2PISECf8phXq0GYPr4law,16378
|
|
162
|
+
evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
163
|
+
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
164
|
+
evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
|
|
165
|
+
evalscope/metrics/metrics.py,sha256=9Qj2KuSmaLOPhpGdBfiKGKVTIxHCuk0CPKI2b6L1zb8,12589
|
|
166
|
+
evalscope/metrics/rouge_metric.py,sha256=oB-rBgMnavZSyOiAefg--OXdGfffKrET5bUmrx3nmx0,4408
|
|
167
|
+
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
168
|
+
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=9YdE38duhBFsmFLkY7HXDCQqUNavB5Hh3kaB4WTjAII,11971
|
|
169
|
+
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
|
|
170
|
+
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
171
|
+
evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
|
|
172
|
+
evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
|
|
173
|
+
evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
|
|
174
|
+
evalscope/models/model_adapter.py,sha256=XBeSFTR9pXmnhFWRRddcobnITC5T4JKooeFUeWEtUVI,19006
|
|
175
|
+
evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
|
|
176
|
+
evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
|
|
177
|
+
evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
|
|
178
|
+
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
179
|
+
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
180
|
+
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
181
|
+
evalscope/perf/arguments.py,sha256=_gW1tq7SbrAZd05N-FbY_oWrQB0Djs4KUaFdXSfFsr8,9112
|
|
182
|
+
evalscope/perf/benchmark.py,sha256=ff9PFFMY5UucuUihcdo6lSf1X9XXoaOmrpBvjDk5Mrw,9599
|
|
183
|
+
evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
|
|
184
|
+
evalscope/perf/main.py,sha256=-8NsvJZ7uyVfJT9N2lX36KfsHkVTy0r8OcsWPYoKms0,1316
|
|
185
|
+
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
186
|
+
evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
|
|
187
|
+
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
188
|
+
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
189
|
+
evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
|
|
190
|
+
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
191
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=KRN6EjObTG08mcI82kJD3dGK7DoVMUZzrUZ1AgoLEp0,7007
|
|
192
|
+
evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
|
|
193
|
+
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
194
|
+
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
195
|
+
evalscope/perf/plugin/datasets/flickr8k.py,sha256=CGYtmRw71-ycJIObAHm2gmmJl_1MXPJOwmHV-0WS8DY,1581
|
|
196
|
+
evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
|
|
197
|
+
evalscope/perf/plugin/datasets/longalpaca.py,sha256=Yx5nxHGkmD4lJOJ-jcyqm2ZsGAxotJc77jUCkO1z0a4,1164
|
|
198
|
+
evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
|
|
199
|
+
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
200
|
+
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
201
|
+
evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
|
|
202
|
+
evalscope/perf/utils/benchmark_util.py,sha256=xFZSSUoBoFpHRZC69-KS9cK2vqJlL7rIuCEz_MnpnGA,5564
|
|
203
|
+
evalscope/perf/utils/db_util.py,sha256=A2K3otCrNw3K1SMwoYo8a6jekT5nAVvWJepqi31DH28,7479
|
|
204
|
+
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
205
|
+
evalscope/perf/utils/local_server.py,sha256=31EQZ8S_SzgSiBFpc9zRU13GXm2jREvRmPDN5qWKgbg,4468
|
|
206
|
+
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
207
|
+
evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
|
|
208
|
+
evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
|
|
209
|
+
evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
|
|
210
|
+
evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
|
|
211
|
+
evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
|
|
212
|
+
evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
|
|
213
|
+
evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
|
|
214
|
+
evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
|
|
215
|
+
evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
|
|
216
|
+
evalscope/registry/tasks/arc.yaml,sha256=MghUuCmZPEwGqwYhA8ClRWHiSwC3kbHcKMRicQl9aqc,765
|
|
217
|
+
evalscope/registry/tasks/bbh.yaml,sha256=GE3PpE8zw_SROj41LZ5bTm6ZXXZjYOorAdwBCTEePXM,604
|
|
218
|
+
evalscope/registry/tasks/bbh_mini.yaml,sha256=8o9ZiWaCTkN2uTwiOhjBQuyKm7GUw6ZfUZxb2bkOmvs,678
|
|
219
|
+
evalscope/registry/tasks/ceval.yaml,sha256=XDaszb7DROKk8nQDiklirTvDJwkOUJtIN_tcUFVvIJk,703
|
|
220
|
+
evalscope/registry/tasks/ceval_mini.yaml,sha256=4aYW4c0IzgAXSs5dp4d8dJ0OHVp5sD4uiRjChjL1zZg,672
|
|
221
|
+
evalscope/registry/tasks/cmmlu.yaml,sha256=yOgKl1jmfcAfTuUcIMmG5SQhkrbEHEyyP3YuCuIN3l0,703
|
|
222
|
+
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=egdiM5oG7RSs0M-g8QNikwhJ9tZVgw5FiLy-rIYYHAA,737
|
|
223
|
+
evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNiAhGEdUqL-8c,702
|
|
224
|
+
evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
|
|
225
|
+
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
226
|
+
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
227
|
+
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
228
|
+
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
229
|
+
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
230
|
+
evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
|
|
231
|
+
evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
|
|
232
|
+
evalscope/third_party/longbench_write/eval.py,sha256=bZrpaKg9sPXv2VkUxLpfJiNqMIoIj7Pf3eFMqmDncyY,11229
|
|
233
|
+
evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
|
|
234
|
+
evalscope/third_party/longbench_write/longbench_write.py,sha256=1caNiJvmZL2vwDU6oHUE4cdCViZGYE8yBo9EsMcA-Qw,3955
|
|
235
|
+
evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
|
|
236
|
+
evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
237
|
+
evalscope/third_party/longbench_write/resources/judge.txt,sha256=Go1ISY4bUBmEDXXY_DItjAmskuHSaRj5WTNMNH98FSk,1885
|
|
238
|
+
evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26ZSXzCTWWJTWXgFAYvOYupRuvdJUt_izOeSNOrV3k,54155
|
|
239
|
+
evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
|
|
240
|
+
evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
|
|
241
|
+
evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
242
|
+
evalscope/third_party/longbench_write/tools/data_etl.py,sha256=nmWKOrD-GeZi0ZGH5jLCGuW3qiLTui8ASSxI2z8l6ls,5962
|
|
243
|
+
evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
|
|
244
|
+
evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
|
|
245
|
+
evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
|
|
246
|
+
evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2eexlehSi9LI4F3EPk-3JacrAb6ZoyxI,451
|
|
247
|
+
evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
|
|
248
|
+
evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
|
|
249
|
+
evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
|
|
250
|
+
evalscope/third_party/toolbench_static/toolbench_static.py,sha256=y4nC9WCBCgBg378aWYAdhmrFte_r_XOkigJs7XJ_iXQ,1930
|
|
251
|
+
evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
252
|
+
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
|
|
253
|
+
evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
254
|
+
evalscope/tools/combine_reports.py,sha256=1BJ29IEUKoZLM3NAzg_IpU8B9uhljO9-b_hqAYi9RpA,5078
|
|
255
|
+
evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
|
|
256
|
+
evalscope/tools/rewrite_eval_results.py,sha256=2lbDHfF_9abK1tUk2UYZZRwzO68eoiE36dXyh_b-mwg,2011
|
|
257
|
+
evalscope/utils/__init__.py,sha256=hDS1xpoAxtVH4-ZQOXstdg7WYmjcGPQ62Kh54FIgkwU,87
|
|
258
|
+
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
259
|
+
evalscope/utils/chat_service.py,sha256=N8lJPiVtzdqsHypa42wzb15T7hduXUrRPtU3Atf8yg4,8641
|
|
260
|
+
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
261
|
+
evalscope/utils/logger.py,sha256=IkY0oxkWSvfA0z1m79crioTiqQcnxulNF5HtJNlV0Fc,3174
|
|
262
|
+
evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
|
|
263
|
+
evalscope/utils/utils.py,sha256=PVtpv3WAIm6Bs66Vz4KBDiAiXp8y6Oejxxr1LWHTRsI,15146
|
|
264
|
+
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
265
|
+
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
266
|
+
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
267
|
+
tests/cli/test_run.py,sha256=lXR35DDLQjdb-XGA6pKnQC9pJTfTOHjknAN7PEaw8G4,4334
|
|
268
|
+
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
269
|
+
tests/perf/test_perf.py,sha256=GD5nInXpQG7H1E8wI6dvy4DFSvTEddGDzv-Cu8YV1ts,2995
|
|
270
|
+
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
271
|
+
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
272
|
+
tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
|
|
273
|
+
tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
|
|
274
|
+
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
275
|
+
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
276
|
+
tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
|
|
277
|
+
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
278
|
+
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
279
|
+
tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
|
|
280
|
+
evalscope-0.8.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
281
|
+
evalscope-0.8.0.dist-info/METADATA,sha256=5RKZaNBwuJj84sdAXlNmT11Bm8kGYha6EYnqszwZ1Qk,23190
|
|
282
|
+
evalscope-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
283
|
+
evalscope-0.8.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
284
|
+
evalscope-0.8.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
285
|
+
evalscope-0.8.0.dist-info/RECORD,,
|
tests/cli/test_run.py
CHANGED
|
@@ -1,17 +1,22 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
|
|
2
|
+
import os
|
|
3
3
|
import subprocess
|
|
4
|
+
import torch
|
|
4
5
|
import unittest
|
|
5
|
-
|
|
6
|
+
|
|
7
|
+
from evalscope.run import run_task
|
|
8
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
6
9
|
from evalscope.utils.logger import get_logger
|
|
7
10
|
|
|
11
|
+
os.environ['LOG_LEVEL'] = 'DEBUG'
|
|
12
|
+
|
|
8
13
|
logger = get_logger()
|
|
9
14
|
|
|
10
15
|
|
|
11
16
|
class TestRun(unittest.TestCase):
|
|
12
17
|
|
|
13
18
|
def setUp(self) -> None:
|
|
14
|
-
logger.info(
|
|
19
|
+
logger.info('Init env for evalscope native run UTs ...\n')
|
|
15
20
|
self._check_env('evalscope')
|
|
16
21
|
|
|
17
22
|
def tearDown(self) -> None:
|
|
@@ -26,14 +31,12 @@ class TestRun(unittest.TestCase):
|
|
|
26
31
|
|
|
27
32
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
28
33
|
def test_run_simple_eval(self):
|
|
29
|
-
model = '
|
|
30
|
-
template_type = 'chatglm3'
|
|
34
|
+
model = 'qwen/Qwen2-0.5B-Instruct'
|
|
31
35
|
datasets = 'arc' # arc ceval
|
|
32
|
-
limit =
|
|
36
|
+
limit = 10
|
|
33
37
|
|
|
34
|
-
cmd_simple = f'
|
|
38
|
+
cmd_simple = f'evalscope eval ' \
|
|
35
39
|
f'--model {model} ' \
|
|
36
|
-
f'--template-type {template_type} ' \
|
|
37
40
|
f'--datasets {datasets} ' \
|
|
38
41
|
f'--limit {limit}'
|
|
39
42
|
|
|
@@ -46,15 +49,13 @@ class TestRun(unittest.TestCase):
|
|
|
46
49
|
|
|
47
50
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
48
51
|
def test_run_eval_with_args(self):
|
|
49
|
-
model = '
|
|
50
|
-
|
|
51
|
-
datasets = 'arc ceval' # arc ceval
|
|
52
|
+
model = 'qwen/Qwen2-0.5B-Instruct'
|
|
53
|
+
datasets = 'arc' # arc ceval
|
|
52
54
|
limit = 5
|
|
53
55
|
dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'
|
|
54
56
|
|
|
55
|
-
cmd_with_args = f'
|
|
57
|
+
cmd_with_args = f'evalscope eval ' \
|
|
56
58
|
f'--model {model} ' \
|
|
57
|
-
f'--template-type {template_type} ' \
|
|
58
59
|
f'--datasets {datasets} ' \
|
|
59
60
|
f'--limit {limit} ' \
|
|
60
61
|
f'--generation-config do_sample=false,temperature=0.0 ' \
|
|
@@ -68,9 +69,47 @@ class TestRun(unittest.TestCase):
|
|
|
68
69
|
logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
|
|
69
70
|
|
|
70
71
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
71
|
-
def
|
|
72
|
-
|
|
72
|
+
def test_run_task(self):
|
|
73
|
+
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k'], 'limit': 2, 'debug': False}
|
|
74
|
+
run_task(task_cfg=task_cfg)
|
|
73
75
|
|
|
74
76
|
|
|
77
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
78
|
+
def test_run_custom_task(self):
|
|
79
|
+
from evalscope.config import TaskConfig
|
|
80
|
+
|
|
81
|
+
task_cfg = TaskConfig(
|
|
82
|
+
model='qwen/Qwen2-0.5B-Instruct',
|
|
83
|
+
datasets=['ceval'], # 数据格式,选择题格式固定为 'ceval'
|
|
84
|
+
dataset_args={
|
|
85
|
+
'ceval': {
|
|
86
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
87
|
+
'subset_list': [
|
|
88
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
},
|
|
92
|
+
)
|
|
93
|
+
run_task(task_cfg=task_cfg)
|
|
94
|
+
|
|
95
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
96
|
+
def test_run_custom_qa(self):
|
|
97
|
+
from evalscope.config import TaskConfig
|
|
98
|
+
|
|
99
|
+
task_cfg = TaskConfig(
|
|
100
|
+
model='qwen/Qwen2-0.5B-Instruct',
|
|
101
|
+
datasets=['general_qa'], # 数据格式,选择题格式固定为 'ceval'
|
|
102
|
+
dataset_args={
|
|
103
|
+
'general_qa': {
|
|
104
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
105
|
+
'subset_list': [
|
|
106
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
107
|
+
]
|
|
108
|
+
}
|
|
109
|
+
},
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
run_task(task_cfg=task_cfg)
|
|
113
|
+
|
|
75
114
|
if __name__ == '__main__':
|
|
76
115
|
unittest.main()
|
tests/perf/test_perf.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
import os
|
|
3
|
+
|
|
4
|
+
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
|
3
5
|
import unittest
|
|
4
6
|
|
|
5
7
|
from evalscope.perf.main import run_perf_benchmark
|
|
@@ -49,6 +51,8 @@ class TestPerf(unittest.TestCase):
|
|
|
49
51
|
'model': 'qwen2.5',
|
|
50
52
|
'api': 'openai',
|
|
51
53
|
'dataset': 'speed_benchmark',
|
|
54
|
+
'min_tokens': 2048,
|
|
55
|
+
'max_tokens': 2048,
|
|
52
56
|
'debug': True,
|
|
53
57
|
}
|
|
54
58
|
run_perf_benchmark(task_cfg)
|
tests/rag/test_clip_benchmark.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
-
|
|
5
4
|
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
|
6
5
|
import subprocess
|
|
7
6
|
import unittest
|
|
8
|
-
|
|
9
|
-
from evalscope.utils.logger import get_logger
|
|
7
|
+
|
|
10
8
|
from evalscope.run import run_task
|
|
9
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
10
|
+
from evalscope.utils.logger import get_logger
|
|
11
11
|
|
|
12
12
|
logger = get_logger()
|
|
13
13
|
|
|
@@ -15,7 +15,7 @@ logger = get_logger()
|
|
|
15
15
|
class TestCLIPBenchmark(unittest.TestCase):
|
|
16
16
|
|
|
17
17
|
def setUp(self) -> None:
|
|
18
|
-
self._check_env(
|
|
18
|
+
self._check_env('webdataset')
|
|
19
19
|
|
|
20
20
|
def tearDown(self) -> None:
|
|
21
21
|
pass
|
|
@@ -23,57 +23,57 @@ class TestCLIPBenchmark(unittest.TestCase):
|
|
|
23
23
|
@staticmethod
|
|
24
24
|
def _check_env(module_name: str):
|
|
25
25
|
if is_module_installed(module_name):
|
|
26
|
-
logger.info(f
|
|
26
|
+
logger.info(f'{module_name} is installed.')
|
|
27
27
|
else:
|
|
28
|
-
raise ModuleNotFoundError(f
|
|
28
|
+
raise ModuleNotFoundError(f'run: pip install {module_name}')
|
|
29
29
|
|
|
30
|
-
@unittest.skipUnless(0 in test_level_list(),
|
|
30
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
31
31
|
def test_run_task(self):
|
|
32
32
|
task_cfg = {
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
'eval_backend': 'RAGEval',
|
|
34
|
+
'eval_config': {
|
|
35
|
+
'tool': 'clip_benchmark',
|
|
36
|
+
'eval': {
|
|
37
|
+
'models': [
|
|
38
38
|
{
|
|
39
|
-
|
|
39
|
+
'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
|
|
40
40
|
}
|
|
41
41
|
],
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
42
|
+
'dataset_name': ['muge', 'mnist'],
|
|
43
|
+
'split': 'test',
|
|
44
|
+
'batch_size': 128,
|
|
45
|
+
'num_workers': 1,
|
|
46
|
+
'verbose': True,
|
|
47
|
+
'skip_existing': False,
|
|
48
|
+
'output_dir': 'outputs',
|
|
49
|
+
'cache_dir': 'cache',
|
|
50
|
+
'limit': 1000,
|
|
51
51
|
},
|
|
52
52
|
},
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
run_task(task_cfg)
|
|
56
56
|
|
|
57
|
-
@unittest.skipUnless(0 in test_level_list(),
|
|
57
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
58
58
|
def test_run_custom(self):
|
|
59
59
|
task_cfg = {
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
60
|
+
'eval_backend': 'RAGEval',
|
|
61
|
+
'eval_config': {
|
|
62
|
+
'tool': 'clip_benchmark',
|
|
63
|
+
'eval': {
|
|
64
|
+
'models': [
|
|
65
65
|
{
|
|
66
|
-
|
|
66
|
+
'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
|
|
67
67
|
}
|
|
68
68
|
],
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
69
|
+
'dataset_name': ['custom'],
|
|
70
|
+
'data_dir': 'custom_eval/multimodal/text-image-retrieval',
|
|
71
|
+
'split': 'test',
|
|
72
|
+
'batch_size': 128,
|
|
73
|
+
'num_workers': 1,
|
|
74
|
+
'verbose': True,
|
|
75
|
+
'skip_existing': False,
|
|
76
|
+
'limit': 1000,
|
|
77
77
|
},
|
|
78
78
|
},
|
|
79
79
|
}
|
|
@@ -81,5 +81,5 @@ class TestCLIPBenchmark(unittest.TestCase):
|
|
|
81
81
|
run_task(task_cfg)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
if __name__ ==
|
|
84
|
+
if __name__ == '__main__':
|
|
85
85
|
unittest.main(buffer=False)
|
tests/rag/test_mteb.py
CHANGED
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
import subprocess
|
|
4
4
|
import unittest
|
|
5
|
-
|
|
6
|
-
from evalscope.utils.logger import get_logger
|
|
5
|
+
|
|
7
6
|
from evalscope.run import run_task
|
|
7
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
8
|
+
from evalscope.utils.logger import get_logger
|
|
8
9
|
|
|
9
10
|
logger = get_logger()
|
|
10
11
|
|
tests/rag/test_ragas.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
import os
|
|
3
3
|
import unittest
|
|
4
|
-
|
|
5
|
-
from evalscope.utils.logger import get_logger
|
|
4
|
+
|
|
6
5
|
from evalscope.run import run_task
|
|
6
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
7
|
+
from evalscope.utils.logger import get_logger
|
|
7
8
|
|
|
8
9
|
logger = get_logger()
|
|
9
10
|
|
|
@@ -40,7 +41,6 @@ class TestRAGAS(unittest.TestCase):
|
|
|
40
41
|
},
|
|
41
42
|
'generator_llm': {
|
|
42
43
|
'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
|
|
43
|
-
'template_type': 'qwen',
|
|
44
44
|
},
|
|
45
45
|
'embeddings': {
|
|
46
46
|
'model_name_or_path': 'AI-ModelScope/m3e-base',
|
|
@@ -64,7 +64,6 @@ class TestRAGAS(unittest.TestCase):
|
|
|
64
64
|
'testset_file': 'outputs/testset_chinese_with_answer.json',
|
|
65
65
|
'critic_llm': {
|
|
66
66
|
'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
|
|
67
|
-
'template_type': 'qwen',
|
|
68
67
|
},
|
|
69
68
|
'embeddings': {
|
|
70
69
|
'model_name_or_path': 'AI-ModelScope/m3e-base',
|
|
@@ -90,7 +89,8 @@ class TestRAGAS(unittest.TestCase):
|
|
|
90
89
|
'eval_config': {
|
|
91
90
|
'tool': 'RAGAS',
|
|
92
91
|
'eval': {
|
|
93
|
-
'testset_file':
|
|
92
|
+
'testset_file':
|
|
93
|
+
'outputs/testset.json',
|
|
94
94
|
'critic_llm': {
|
|
95
95
|
'model_name': 'gpt-4o-mini', # 自定义聊天模型名称
|
|
96
96
|
'api_base': 'http://127.0.0.1:8088/v1', # 自定义基础URL
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
|
|
3
|
+
import json
|
|
3
4
|
import os
|
|
5
|
+
import requests
|
|
4
6
|
import subprocess
|
|
5
7
|
import time
|
|
6
8
|
import unittest
|
|
7
9
|
|
|
8
|
-
import json
|
|
9
|
-
import requests
|
|
10
|
-
|
|
11
10
|
from evalscope.backend.opencompass import OpenCompassBackendManager
|
|
12
11
|
from evalscope.run import run_task
|
|
13
12
|
from evalscope.summarizer import Summarizer
|