evalscope 0.8.2__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.8.2/evalscope.egg-info → evalscope-0.9.0}/PKG-INFO +32 -15
- {evalscope-0.8.2 → evalscope-0.9.0}/README.md +31 -14
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/__init__.py +2 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/arguments.py +10 -3
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope-0.9.0/evalscope/benchmarks/__init__.py +23 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/arc/arc_adapter.py +23 -99
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope-0.9.0/evalscope/benchmarks/benchmark.py +76 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope-0.9.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +126 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/data_adapter.py +114 -85
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope-0.9.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/race_adapter.py +25 -53
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope-0.9.0/evalscope/collections/__init__.py +3 -0
- evalscope-0.9.0/evalscope/collections/evaluator.py +178 -0
- evalscope-0.9.0/evalscope/collections/sampler.py +132 -0
- evalscope-0.9.0/evalscope/collections/schema.py +122 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/config.py +7 -5
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/constants.py +7 -28
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/evaluator.py +66 -109
- evalscope-0.9.0/evalscope/evaluator/reviewer/__init__.py +1 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope-0.9.0/evalscope/metrics/__init__.py +7 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope-0.9.0/evalscope/metrics/math_accuracy.py +200 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/metrics.py +7 -4
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/rouge_metric.py +13 -8
- evalscope-0.9.0/evalscope/models/__init__.py +16 -0
- evalscope-0.9.0/evalscope/models/base_adapter.py +52 -0
- evalscope-0.9.0/evalscope/models/chat_adapter.py +138 -0
- evalscope-0.9.0/evalscope/models/choice_adapter.py +211 -0
- evalscope-0.9.0/evalscope/models/custom_adapter.py +67 -0
- evalscope-0.9.0/evalscope/models/local_model.py +74 -0
- evalscope-0.9.0/evalscope/models/model.py +229 -0
- evalscope-0.9.0/evalscope/models/server_adapter.py +104 -0
- evalscope-0.9.0/evalscope/registry/__init__.py +1 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/run.py +37 -66
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/run_arena.py +1 -1
- evalscope-0.9.0/evalscope/third_party/__init__.py +1 -0
- evalscope-0.9.0/evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope-0.9.0/evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope-0.9.0/evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope-0.9.0/evalscope/tools/__init__.py +1 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/__init__.py +1 -1
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/chat_service.py +4 -3
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/io_utils.py +8 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/logger.py +4 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/model_utils.py +10 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/utils.py +3 -25
- evalscope-0.9.0/evalscope/version.py +4 -0
- {evalscope-0.8.2 → evalscope-0.9.0/evalscope.egg-info}/PKG-INFO +32 -15
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/SOURCES.txt +14 -5
- evalscope-0.9.0/tests/__init__.py +1 -0
- evalscope-0.9.0/tests/cli/__init__.py +1 -0
- evalscope-0.9.0/tests/cli/test_collection.py +53 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/cli/test_run.py +43 -1
- evalscope-0.9.0/tests/perf/__init__.py +1 -0
- evalscope-0.9.0/tests/rag/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_mteb.py +3 -2
- evalscope-0.9.0/tests/swift/__init__.py +1 -0
- evalscope-0.9.0/tests/vlm/__init__.py +1 -0
- evalscope-0.8.2/evalscope/benchmarks/__init__.py +0 -4
- evalscope-0.8.2/evalscope/benchmarks/arc/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/bbh/__init__.py +0 -5
- evalscope-0.8.2/evalscope/benchmarks/benchmark.py +0 -65
- evalscope-0.8.2/evalscope/benchmarks/ceval/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/cmmlu/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/competition_math/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -468
- evalscope-0.8.2/evalscope/benchmarks/general_qa/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/gsm8k/__init__.py +0 -5
- evalscope-0.8.2/evalscope/benchmarks/hellaswag/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/humaneval/__init__.py +0 -5
- evalscope-0.8.2/evalscope/benchmarks/mmlu/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/race/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/trivia_qa/__init__.py +0 -6
- evalscope-0.8.2/evalscope/benchmarks/truthful_qa/__init__.py +0 -6
- evalscope-0.8.2/evalscope/metrics/math_accuracy.py +0 -57
- evalscope-0.8.2/evalscope/models/__init__.py +0 -3
- evalscope-0.8.2/evalscope/models/api/__init__.py +0 -3
- evalscope-0.8.2/evalscope/models/dummy_chat_model.py +0 -49
- evalscope-0.8.2/evalscope/models/model.py +0 -88
- evalscope-0.8.2/evalscope/models/model_adapter.py +0 -525
- evalscope-0.8.2/evalscope/models/openai_model.py +0 -103
- evalscope-0.8.2/evalscope/version.py +0 -4
- {evalscope-0.8.2 → evalscope-0.9.0}/LICENSE +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/MANIFEST.in +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.8.2/evalscope/cli → evalscope-0.9.0/evalscope/benchmarks/arc}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.8.2/evalscope/evaluator/reviewer → evalscope-0.9.0/evalscope/benchmarks/bbh}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.8.2/evalscope/metrics → evalscope-0.9.0/evalscope/benchmarks/ceval}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
- {evalscope-0.8.2/evalscope/registry → evalscope-0.9.0/evalscope/benchmarks/cmmlu}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.8.2/evalscope/third_party → evalscope-0.9.0/evalscope/benchmarks/competition_math}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.8.2/evalscope/third_party/longbench_write/resources → evalscope-0.9.0/evalscope/benchmarks/general_qa}/__init__.py +0 -0
- {evalscope-0.8.2/evalscope/third_party/longbench_write/tools → evalscope-0.9.0/evalscope/benchmarks/gsm8k}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.8.2/evalscope/third_party/toolbench_static/llm → evalscope-0.9.0/evalscope/benchmarks/hellaswag}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.8.2/evalscope/tools → evalscope-0.9.0/evalscope/benchmarks/humaneval}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.8.2/tests → evalscope-0.9.0/evalscope/benchmarks/mmlu}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.8.2/evalscope/perf → evalscope-0.9.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.8.2/tests/cli → evalscope-0.9.0/evalscope/benchmarks/race}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.8.2/tests/perf → evalscope-0.9.0/evalscope/benchmarks/trivia_qa}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.8.2/tests/swift → evalscope-0.9.0/evalscope/benchmarks/truthful_qa}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.8.2/tests/vlm → evalscope-0.9.0/evalscope/cli}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.8.2/evalscope/perf/utils → evalscope-0.9.0/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/main.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.8.2/tests/rag → evalscope-0.9.0/evalscope/perf/utils}/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/summarizer.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.8.2/evalscope/models/api → evalscope-0.9.0/evalscope/third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/requires.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/docs.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/framework.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/inner.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/perf.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/rag.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/tests.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/requirements.txt +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/setup.cfg +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/setup.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/perf/test_perf.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/test_run_all.py +0 -0
- {evalscope-0.8.2 → evalscope-0.9.0}/tests/vlm/test_vlmeval.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.2
+Version: 0.9.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -160,14 +160,16 @@ Requires-Dist: unicorn; extra == "all"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Contents
-- [Introduction](
-- [News](
-- [Installation](
-- [Quick Start](
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](
-- [Model Serving Performance Evaluation](
-- [Arena Mode](
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)
 
 
 ## 📝 Introduction
@@ -208,11 +210,15 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +230,7 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
-
+</details>
 
 ## 🛠️ Installation
 ### Method 1: Install Using pip
@@ -414,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 
-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +445,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```
 
-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
+## 👷♂️ Contribution
 
+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!
 
+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+<table>
+<tr>
+<th colspan="2">
+<br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+</th>
+</tr>
+</table>
+</a>
 
-##
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation
@@ -462,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max
 
 
 ## Star History
README.md
@@ -24,14 +24,16 @@
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Contents
-- [Introduction](
-- [News](
-- [Installation](
-- [Quick Start](
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](
-- [Model Serving Performance Evaluation](
-- [Arena Mode](
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)
 
 
 ## 📝 Introduction
@@ -72,11 +74,15 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -88,7 +94,7 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
-
+</details>
 
 ## 🛠️ Installation
 ### Method 1: Install Using pip
@@ -278,7 +284,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 
-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -303,19 +309,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```
 
-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
+## 👷♂️ Contribution
 
+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!
 
+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+<table>
+<tr>
+<th colspan="2">
+<br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+</th>
+</tr>
+</table>
+</a>
 
-##
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation
@@ -326,8 +345,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max
 
 
 ## Star History
evalscope/arguments.py
@@ -1,6 +1,8 @@
 import argparse
 import json
 
+from evalscope.constants import EvalBackend, EvalStage, EvalType
+
 
 class ParseStrArgsAction(argparse.Action):
 
@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
 
     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
-    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.'
+    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
 
     # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
     # yapf: enable
 
 
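For context, the new `--eval-type`, `--eval-backend`, and `--stage` options are now constrained by `choices` drawn from `evalscope.constants`, and the added `--api-key`/`--api-url` flags default to `'EMPTY'`/`None`. The following is a minimal, self-contained sketch of the resulting CLI behaviour; it is not taken from the package, and the stand-in string values of the `EvalType` members are an assumption.

```python
# Minimal sketch (not from the package). The EvalType stand-in mimics what
# evalscope.constants presumably provides; its exact string values are assumed.
import argparse


class EvalType:  # hypothetical stand-in for evalscope.constants.EvalType
    CHECKPOINT = 'checkpoint'
    CUSTOM = 'custom'
    SERVICE = 'service'


parser = argparse.ArgumentParser()
# Mirrors the 0.9.0 change: invalid values are rejected up front by argparse.
parser.add_argument('--eval-type', type=str,
                    choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
parser.add_argument('--api-key', type=str, default='EMPTY')
parser.add_argument('--api-url', type=str, default=None)

# Example invocation with a hypothetical local endpoint.
args = parser.parse_args(['--eval-type', 'service', '--api-url', 'http://127.0.0.1:8000/v1'])
print(args.eval_type, args.api_url, args.api_key)  # -> service http://127.0.0.1:8000/v1 EMPTY
# parser.parse_args(['--eval-type', 'bogus'])      # would exit with "invalid choice"
```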
evalscope/backend/rag_eval/utils/llm.py
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter
 
 
 class LLM:
evalscope/benchmarks/__init__.py (new file)
@@ -0,0 +1,23 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import importlib
+import os
+
+from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+# Using glob to find all files matching the pattern
+pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+files = glob.glob(pattern, recursive=False)
+
+for file_path in files:
+    if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+        # Convert file path to a module path
+        relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+        module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+        full_path = f'evalscope.benchmarks.{module_path}'
+        importlib.import_module(full_path)
+        # print(f'Importing {full_path}')
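This new initializer auto-discovers every `*_adapter.py` under `evalscope/benchmarks/<subdir>/` and imports it, so each adapter's `@Benchmark.register(...)` decorator runs as a side effect of `import evalscope.benchmarks`. A standalone sketch of the path-to-module conversion it performs is shown below; the directory and file names used are hypothetical and nothing from evalscope needs to be installed to run it.

```python
# Standalone sketch of the path -> module-name conversion used above.
# The example paths are hypothetical; in the real package, importing the
# resulting module triggers its @Benchmark.register(...) decorator.
import os

pkg_dir = '/site-packages/evalscope/benchmarks'            # pretend package directory
file_path = os.path.join(pkg_dir, 'arc', 'arc_adapter.py')  # one discovered adapter file

relative_path = os.path.relpath(file_path, pkg_dir)         # 'arc/arc_adapter.py' on POSIX
module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' -> 'arc.arc_adapter'
full_path = f'evalscope.benchmarks.{module_path}'

print(full_path)  # -> evalscope.benchmarks.arc.arc_adapter
```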
evalscope/benchmarks/arc/arc_adapter.py
@@ -3,40 +3,35 @@
 import json
 import os
 
-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/ai2_arc'
-
-# task_list = ['ARC-Easy', 'ARC-Challenge']
-SUBSET_LIST = ['ARC-Challenge']
-
 
+@Benchmark.register(
+    name='arc',
+    dataset_id='modelscope/ai2_arc',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['ARC-Easy', 'ARC-Challenge'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class ARCAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+    def __init__(self, **kwargs):
+        few_shot_num = kwargs.get('few_shot_num', None)
         if few_shot_num is None:
             # Use 0-shot by default
             logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
         if few_shot_num != 0:
             logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
@@ -132,7 +120,7 @@ class ARCAdapter(DataAdapter):
         # Get the gold choice
         return input_d.get('answerKey', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -144,12 +132,12 @@ class ARCAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(
                 text=result, options=self.choices)  # TODO: to be checked !
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(
                 text=result, options=self.choices)  # TODO: to be checked !
         else:
@@ -158,70 +146,6 @@ class ARCAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"ARC",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"ARC-Easy",
-                            "score":0.5632
-                        },
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3157
-                        }
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'arc',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
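The ARC rewrite above (and the BBH one that follows) shows the new 0.9.0 registration pattern: dataset metadata moves out of module-level constants and long `__init__` signatures into a single `@Benchmark.register(...)` call, the adapter forwards `**kwargs` to `DataAdapter`, and the boilerplate `compute_metric`/`gen_report` methods are dropped in favour of the shared base implementation. A hypothetical sketch of registering a new benchmark the same way is given below; the benchmark name and dataset id are invented, the imports mirror the ones added in this diff, and a real adapter would still implement the remaining `DataAdapter` hooks (data loading, prompt building, answer parsing) as the ARC adapter does.

```python
# Hypothetical example (not part of this diff): registering a custom benchmark
# with the 0.9.0 decorator, using only fields and imports shown in the diff above.
# 'my_qa' and 'my-org/my_qa' are invented names.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                              # invented benchmark name
    dataset_id='my-org/my_qa',                 # invented ModelScope dataset id
    model_adapter=ChatGenerationModelAdapter,  # generation-style evaluation, as in BBH
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='',
)
class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Registered metadata arrives via **kwargs and is handled by the base class.
        super().__init__(**kwargs)

    def match(self, gold: str, pred: str) -> float:
        # Exact-match scoring, as in the ARC/BBH adapters; other hooks
        # (loading, prompt generation, prediction parsing) are omitted here.
        return exact_match(gold=gold, pred=pred)
```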
evalscope/benchmarks/bbh/bbh_adapter.py
@@ -5,18 +5,17 @@ import os
 import random
 import re
 
-from evalscope.benchmarks
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics
-from evalscope.
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/bbh'
-
 # BBH multiple choice subset list
 MULTIPLE_CHOICE = 'multiple_choice'
 MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
 SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 
 
+@Benchmark.register(
+    name='bbh',
+    dataset_id='modelscope/bbh',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=3,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
 class BBHAdapter(DataAdapter):
     """
     Adapter for BBH free-form and multiple-choices sub-tasks.
     """
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
+    def __init__(self, **kwargs):
 
-
-        metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 3-shot examples by system for BBH.')
-            few_shot_num = 3
+        few_shot_num = kwargs.get('few_shot_num', 3)
 
         if few_shot_num != 3 and few_shot_num != 0:
             logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                          f'Use 3-shot by default.')
-            few_shot_num = 3
+            kwargs['few_shot_num'] = 3
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -217,66 +207,6 @@ class BBHAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"BBH",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"BBH",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'bbh',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """