evalscope 0.13.0__tar.gz → 0.13.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- {evalscope-0.13.0/evalscope.egg-info → evalscope-0.13.1}/PKG-INFO +33 -30
- {evalscope-0.13.0 → evalscope-0.13.1}/README.md +32 -29
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/evaluator.py +1 -1
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/config.py +5 -2
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/constants.py +1 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/evaluator.py +5 -4
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom_adapter.py +1 -1
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/arguments.py +11 -40
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/benchmark.py +34 -28
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/main.py +1 -1
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/__init__.py +1 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope-0.13.1/evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/db_util.py +3 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/run.py +14 -2
- evalscope-0.13.1/evalscope/version.py +4 -0
- {evalscope-0.13.0 → evalscope-0.13.1/evalscope.egg-info}/PKG-INFO +33 -30
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/SOURCES.txt +1 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_run.py +41 -11
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/perf/test_perf.py +23 -0
- evalscope-0.13.0/evalscope/version.py +0 -4
- {evalscope-0.13.0 → evalscope-0.13.1}/LICENSE +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/MANIFEST.in +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/arguments.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/iquiz/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/math_500/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/musr/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/simple_qa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/benchmarks/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/llm_judge.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/chat_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/choice_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/model.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/register.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/models/server_adapter.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/app.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/report/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/filters.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/requires.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/app.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/docs.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/framework.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/inner.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/perf.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/rag.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/tests.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/requirements.txt +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/setup.cfg +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/setup.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_all.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/cli/test_collection.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/test_run_all.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.13.0 → evalscope-0.13.1}/tests/vlm/test_vlmeval.py +0 -0
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.0
+Version: 0.13.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -239,7 +239,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
-- 🔥 **[2025.03.
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +278,24 @@ We recommend using conda to manage your environment and installing dependencies
 We recommend using conda to manage your environment and installing dependencies with pip:
 
 1. Create a conda environment (optional)
-
-
-
-
-
-
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```
 
 2. Install dependencies using pip
-
-
-
-
-
-
-
-
-
+```shell
+pip install evalscope                # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]' # Install OpenCompass backend
+pip install 'evalscope[vlmeval]'     # Install VLMEvalKit backend
+pip install 'evalscope[rag]'         # Install RAGEval backend
+pip install 'evalscope[perf]'        # Install dependencies for the model performance testing module
+pip install 'evalscope[app]'         # Install dependencies for visualization
+pip install 'evalscope[all]'         # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +309,22 @@ We recommend using conda to manage your environment and installing dependencies
 
 ### Method 2: Install from Source
 1. Download the source code
-
-
-
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 
 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+pip install -e .                  # Install Native backend
+# Additional options
+pip install -e '.[opencompass]'   # Install OpenCompass backend
+pip install -e '.[vlmeval]'       # Install VLMEvalKit backend
+pip install -e '.[rag]'           # Install RAGEval backend
+pip install -e '.[perf]'          # Install Perf dependencies
+pip install -e '.[app]'           # Install visualization dependencies
+pip install -e '.[all]'           # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 
 ## 🚀 Quick Start
```
README.md

```diff
@@ -88,7 +88,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
-- 🔥 **[2025.03.
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -126,23 +127,24 @@ Please scan the QR code below to join our community groups:
 We recommend using conda to manage your environment and installing dependencies with pip:
 
 1. Create a conda environment (optional)
-
-
-
-
-
-
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```
 
 2. Install dependencies using pip
-
-
-
-
-
-
-
-
-
+```shell
+pip install evalscope                # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]' # Install OpenCompass backend
+pip install 'evalscope[vlmeval]'     # Install VLMEvalKit backend
+pip install 'evalscope[rag]'         # Install RAGEval backend
+pip install 'evalscope[perf]'        # Install dependencies for the model performance testing module
+pip install 'evalscope[app]'         # Install dependencies for visualization
+pip install 'evalscope[all]'         # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -156,21 +158,22 @@ We recommend using conda to manage your environment and installing dependencies
 
 ### Method 2: Install from Source
 1. Download the source code
-
-
-
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 
 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+pip install -e .                  # Install Native backend
+# Additional options
+pip install -e '.[opencompass]'   # Install OpenCompass backend
+pip install -e '.[vlmeval]'       # Install VLMEvalKit backend
+pip install -e '.[rag]'           # Install RAGEval backend
+pip install -e '.[perf]'          # Install Perf dependencies
+pip install -e '.[app]'           # Install visualization dependencies
+pip install -e '.[all]'           # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```
 
 
 ## 🚀 Quick Start
```
evalscope/collections/evaluator.py

```diff
@@ -181,7 +181,7 @@ class EvaluatorCollection:
         answers_list = jsonl_to_list(pred_file_path)
         indices = set()
         for answer in answers_list:
-            index = answer
+            index = answer.get(AnswerKeys.INDEX)
             answer_dict[index] = answer
             indices.add(index)
         data = []
```
evalscope/config.py

```diff
@@ -81,7 +81,7 @@ class TaskConfig:
     def __post_init__(self):
         if (not self.model_id) and self.model:
             if isinstance(self.model, CustomModel):
-                self.model_id =
+                self.model_id = self.model.config.get('model_id', 'custom_model')
             else:
                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
                 # fix path error, see http://github.com/modelscope/evalscope/issues/377
@@ -92,7 +92,10 @@ class TaskConfig:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
     def to_dict(self):
-
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
```
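The new `to_dict` guard above keeps the task configuration serializable when `model` holds a `CustomModel` instance rather than a path string. A minimal standalone sketch of the same pattern, using stand-in classes rather than the real evalscope types:

```python
import json
from dataclasses import dataclass, field


class CustomModel:
    """Stand-in for evalscope's CustomModel (simplified, hypothetical)."""

    def __init__(self, config=None):
        self.config = config or {}


@dataclass
class TaskConfigSketch:
    model: object = None
    model_id: str = ''
    datasets: list = field(default_factory=list)

    def to_dict(self):
        result = self.__dict__.copy()
        # A live model object is not JSON-serializable; record its class name instead.
        if isinstance(self.model, CustomModel):
            result['model'] = self.model.__class__.__name__
        return result


cfg = TaskConfigSketch(model=CustomModel({'model_id': 'my-model'}), model_id='my-model')
print(json.dumps(cfg.to_dict(), indent=2))
```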
evalscope/evaluator/evaluator.py

```diff
@@ -81,7 +81,7 @@ class Evaluator(object):
         for subset_name, prompts_list in prompts.items():
             limit = self.task_cfg.limit or len(prompts_list)
             for index, prompt in enumerate(prompts_list[:limit]):
-                prompt[
+                prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)
 
         return limited_prompts
@@ -97,7 +97,8 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
@@ -117,7 +118,7 @@ class Evaluator(object):
         return answers_list, prompts_list
 
     def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-        indices = [answer
+        indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
 
         if all(index is None for index in indices):
             return list(range(len(answers_list)))
@@ -238,7 +239,7 @@ class Evaluator(object):
             pred = pred_content
 
             choice[ReviewKeys.REVIEW] = {
-                ReviewKeys.GOLD: gold_content,
+                ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
                 ReviewKeys.PRED: pred,
                 ReviewKeys.RESULT: review_result
            }
```
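These changes tag every prompt and answer with an index so interrupted runs can resume by matching what has already been answered. A minimal sketch of that index-based resume logic; the plain `'index'` key and the final filtering branch are assumptions standing in for evalscope's `AnswerKeys.INDEX` constant and the code outside this hunk:

```python
from typing import Dict, List

INDEX = 'index'  # stand-in for AnswerKeys.INDEX


def get_answered_indices(answers_list: List[Dict]) -> List[int]:
    indices = [answer.get(INDEX) for answer in answers_list]
    # Older answer files carry no index at all: treat them as answered in order.
    if all(index is None for index in indices):
        return list(range(len(answers_list)))
    # Otherwise keep only the entries that actually recorded an index.
    return [index for index in indices if index is not None]


answers = [{INDEX: 0, 'content': 'A'}, {'content': 'no index recorded'}, {INDEX: 2, 'content': 'C'}]
print(get_answered_indices(answers))  # -> [0, 2]
```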
evalscope/models/custom_adapter.py

```diff
@@ -66,4 +66,4 @@ class CustomModelAdapter(BaseModelAdapter):
             else:
                 raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
 
-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
+        return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
```
evalscope/perf/arguments.py

```diff
@@ -24,6 +24,7 @@ class Arguments:
     connect_timeout: int = 600  # Connection timeout in seconds
     read_timeout: int = 600  # Read timeout in seconds
     api_key: Optional[str] = None
+    no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made
@@ -40,8 +41,9 @@ class Arguments:
     outputs_dir: str = DEFAULT_WORK_DIR
 
     # Prompt settings
-    max_prompt_length: int =
+    max_prompt_length: int = 131072  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
+    prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
 
@@ -65,44 +67,12 @@ class Arguments:
 
     @staticmethod
     def from_args(args):
-
-
-
-
-
-
-            connect_timeout=args.connect_timeout,
-            read_timeout=args.read_timeout,
-            number=args.number,
-            parallel=args.parallel,
-            rate=args.rate,
-            log_every_n_query=args.log_every_n_query,
-            headers=args.headers,
-            wandb_api_key=args.wandb_api_key,
-            name=args.name,
-            outputs_dir=args.outputs_dir,
-            debug=args.debug,
-            tokenizer_path=args.tokenizer_path,
-            api=args.api,
-            max_prompt_length=args.max_prompt_length,
-            min_prompt_length=args.min_prompt_length,
-            prompt=args.prompt,
-            query_template=args.query_template,
-            dataset=args.dataset,
-            dataset_path=args.dataset_path,
-            frequency_penalty=args.frequency_penalty,
-            logprobs=args.logprobs,
-            max_tokens=args.max_tokens,
-            min_tokens=args.min_tokens,
-            n_choices=args.n_choices,
-            seed=args.seed,
-            stop=args.stop,
-            stop_token_ids=args.stop_token_ids,
-            stream=args.stream,
-            temperature=args.temperature,
-            top_p=args.top_p,
-            top_k=args.top_k,
-        )
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+        return Arguments(**args_dict)
 
     def __post_init__(self):
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -153,6 +123,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+    parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
@@ -168,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
     parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
    parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
 
@@ -193,7 +165,6 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
-
     # yapf: enable
 
 
```
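The `from_args` rewrite above stops enumerating every CLI option by hand and instead builds the dataclass from whatever the parser produced, dropping `None` values so the dataclass defaults apply. A self-contained sketch of that Namespace-to-dataclass pattern, with a small stand-in field set rather than the full evalscope argument list:

```python
import argparse
from dataclasses import dataclass


@dataclass
class PerfArgs:
    model: str = 'dummy'
    number: int = 1
    prefix_length: int = 0          # mirrors the field added in this release
    max_prompt_length: int = 131072

    @staticmethod
    def from_args(args: argparse.Namespace) -> 'PerfArgs':
        # Keep only options the user actually set, so dataclass defaults win otherwise.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)  # drop a sub-command callback if the CLI attached one
        return PerfArgs(**args_dict)


parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
parser.add_argument('--number', type=int, default=None)
parser.add_argument('--prefix-length', dest='prefix_length', type=int, default=None)
parser.add_argument('--max-prompt-length', dest='max_prompt_length', type=int, default=None)

print(PerfArgs.from_args(parser.parse_args(['--number', '10', '--prefix-length', '32'])))
```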
evalscope/perf/benchmark.py

```diff
@@ -150,39 +150,45 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
         name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
 
-
-
-
-
-
-
-
-
-
-
-
-
+    collected_benchmark_data = []
+
+    with tqdm(desc='Processing') as pbar:
+        while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
+            try:
+                # Attempt to get benchmark data from the queue with a timeout
+                benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
+                benchmark_data_queue.task_done()
+            except asyncio.TimeoutError:
+                # If timeout, continue to the next iteration
+                continue
+
+            # Update metrics based on the benchmark data
+            metrics.update_metrics(benchmark_data, api_plugin)
 
-
-
+            # Collect benchmark data for later database insertion
+            collected_benchmark_data.append(benchmark_data)
 
-
-
-            con.commit()
+            # Create a message with the updated metrics
+            message = metrics.create_message()
 
-
-
+            # Log the message to wandb if the api key is provided
+            if args.wandb_api_key:
+                wandb.log(message)
 
-
-
-
+            # Log the message to the logger every n queries
+            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
+                msg = json.dumps(message, ensure_ascii=False, indent=2)
+                logger.info(msg)
 
-
-            if int(metrics.n_total_queries) % args.log_every_n_query == 0:
-                msg = json.dumps(message, ensure_ascii=False, indent=2)
-                logger.info(msg)
+            pbar.update(1)  # Update the progress bar
 
-
+    # Now perform database operations after all benchmark data has been processed
+    with sqlite3.connect(result_db_path) as con:
+        cursor = con.cursor()
+        create_result_table(cursor)
+        for benchmark_data in collected_benchmark_data:
+            insert_benchmark_data(cursor, benchmark_data)
+        con.commit()
 
     return metrics, result_db_path
 
@@ -199,7 +205,7 @@ async def start_server(args: Arguments) -> bool:
     else:
         args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
-    if not await test_connection(args):
+    if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')
 
 
```
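The reworked worker above buffers per-request results in memory and writes them to SQLite in one pass after the queue drains, instead of committing inside the async loop. A standalone sketch of that collect-then-commit pattern, with a made-up schema rather than evalscope's actual result table:

```python
import sqlite3

# Pretend these rows were accumulated while consuming the benchmark queue.
collected_benchmark_data = [
    {'request_id': 1, 'latency_s': 0.42},
    {'request_id': 2, 'latency_s': 0.39},
]

with sqlite3.connect(':memory:') as con:
    cursor = con.cursor()
    cursor.execute('CREATE TABLE result (request_id INTEGER, latency_s REAL)')
    # Single batched write after processing finishes, mirroring the refactor above.
    cursor.executemany(
        'INSERT INTO result (request_id, latency_s) VALUES (:request_id, :latency_s)',
        collected_benchmark_data,
    )
    con.commit()
    print('rows:', cursor.execute('SELECT COUNT(*) FROM result').fetchone()[0])
```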
evalscope/perf/main.py

```diff
@@ -32,7 +32,7 @@ def run_perf_benchmark(args):
     if platform.system() == 'Windows':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
-    loop = asyncio.
+    loop = asyncio.new_event_loop()
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
 
```
evalscope/perf/plugin/datasets/__init__.py

```diff
@@ -3,4 +3,5 @@ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
 from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
 from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
 from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.random_dataset import RandomDatasetPlugin
 from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
```
evalscope/perf/plugin/datasets/openqa.py

```diff
@@ -1,5 +1,5 @@
 import json
-import
+import os
 from typing import Any, Dict, Iterator, List
 
 from evalscope.perf.arguments import Arguments
@@ -18,16 +18,11 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
 
     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
-
-
-
-
-
-                'open_qa.jsonl',
-                '--local_dir',
-                './data',
-            ])
-            self.query_parameters.dataset_path = './data/open_qa.jsonl'
+            from modelscope import dataset_snapshot_download
+
+            file_name = 'open_qa.jsonl'
+            local_path = dataset_snapshot_download('AI-ModelScope/HC3-Chinese', allow_patterns=[file_name])
+            self.query_parameters.dataset_path = os.path.join(local_path, file_name)
 
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             item = json.loads(item)
```
evalscope/perf/plugin/datasets/random_dataset.py (new file)

```diff
@@ -0,0 +1,51 @@
+import numpy as np
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('random')
+class RandomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+        assert self.query_parameters.tokenizer_path, 'Tokenizer path is required for random data generation, please provide it with `--tokenizer_path`.'  # noqa: E501
+
+        from modelscope import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.query_parameters.tokenizer_path, trust_remote_code=True)
+        self.prefix_length = self.query_parameters.prefix_length
+        self.prefix_ids = self.get_random_inputs(self.prefix_length)
+        self.template_len = self.get_template_len()
+        self.number = self.query_parameters.number or 1
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        min_prompt_length = self.query_parameters.min_prompt_length - self.template_len
+        max_prompt_length = self.query_parameters.max_prompt_length - self.template_len + 1
+
+        assert min_prompt_length >= 0, f'min_prompt_length should be greater than or equal to the template length {self.template_len}.'  # noqa: E501
+        assert max_prompt_length >= min_prompt_length, 'max_prompt_length should be greater than or equal to min_prompt_length.'  # noqa: E501
+
+        # refer to https://github.com/vllm-project/vllm/blob/ed6e9075d31e32c8548b480a47d1ffb77da1f54c/benchmarks/benchmark_serving.py#L366C1-L399C1  # noqa: E501
+        input_lens = np.random.randint(min_prompt_length, max_prompt_length, size=self.number)
+        offsets = np.random.randint(0, self.tokenizer.vocab_size, size=self.number)
+
+        for i in range(self.number):
+            prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % self.tokenizer.vocab_size
+            prompt = self.tokenizer.decode(
+                self.prefix_ids + prompt_ids.tolist(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
+            yield [{'role': 'user', 'content': prompt}]
+
+    def get_random_inputs(self, length: int) -> List[int]:
+        if length <= 0:
+            return []
+        input_ids = np.random.randint(0, self.tokenizer.vocab_size, size=length).tolist()
+        return input_ids
+
+    def get_template_len(self):
+        empty_message = [{'role': 'user', 'content': ''}]
+        template = self.tokenizer.apply_chat_template(empty_message, tokenize=True, add_generation_prompt=True)
+        return len(template)
```
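The new `random` dataset plugin synthesizes prompts purely from token ids: it samples a target length between `min_prompt_length` and `max_prompt_length`, picks a random offset into the tokenizer vocabulary, and decodes the resulting id sequence, optionally prefixed by a shared block of `prefix_length` random tokens. A dependency-free sketch of that id-generation arithmetic, with a made-up vocabulary size standing in for the tokenizer named by `--tokenizer_path`:

```python
import numpy as np

vocab_size = 32000        # stand-in for tokenizer.vocab_size
prefix_length = 4
min_prompt_length, max_prompt_length = 8, 16
number = 3                # how many synthetic requests to build

rng = np.random.default_rng(0)
prefix_ids = rng.integers(0, vocab_size, size=prefix_length).tolist()

# Same scheme as the plugin: a random length and a random offset per request,
# then consecutive ids wrapped into the vocabulary range.
input_lens = rng.integers(min_prompt_length, max_prompt_length + 1, size=number)
offsets = rng.integers(0, vocab_size, size=number)

for i in range(number):
    prompt_ids = (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
    full_ids = prefix_ids + prompt_ids.tolist()
    # A real run would decode full_ids with the tokenizer to obtain the prompt text.
    print(f'request {i}: {len(full_ids)} token ids, first five -> {full_ids[:5]}')
```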
evalscope/perf/utils/db_util.py

```diff
@@ -2,6 +2,7 @@ import base64
 import json
 import os
 import pickle
+import re
 import sqlite3
 import sys
 from datetime import datetime
@@ -91,6 +92,8 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
 def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
     output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    # Filter illegal characters
+    output_path = re.sub(r'[<>:"|?*]', '_', output_path)
     if not os.path.exists(output_path):
         os.makedirs(output_path, exist_ok=True)
     logger.info(f'Save the result to: {output_path}')
```
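`get_output_path` now strips characters that are illegal in Windows paths before creating the results directory. A quick illustration of the same substitution; the example path is made up:

```python
import re

raw = 'outputs/20250320_120000/qwen2.5:7b|test?'
safe = re.sub(r'[<>:"|?*]', '_', raw)  # same pattern the release applies to output paths
print(safe)  # outputs/20250320_120000/qwen2.5_7b_test_
```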
evalscope/run.py

```diff
@@ -39,9 +39,11 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        result = evaluate_model(task_cfg, outputs)
+
+    return result
 
 
 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -117,6 +119,16 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict
 
+    # Clean up
+    if base_model is not None:
+        import gc
+        import torch
+
+        del base_model
+        del evaluators
+        torch.cuda.empty_cache()
+        gc.collect()
+
     return eval_results
 
 
```
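After a native evaluation finishes, the runner now releases the locally loaded model explicitly so long-running sessions do not keep GPU memory pinned. A hedged standalone sketch of that teardown pattern; unlike the snippet above, it guards the CUDA call so it also runs where torch is absent:

```python
import gc

base_model = object()    # stand-in for the locally loaded model
evaluators = [object()]  # stand-in for the per-dataset evaluators

# Drop the references that keep the model alive, then ask the allocator and GC
# to release what they can.
del base_model
del evaluators

try:
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached CUDA blocks to the driver
except ImportError:
    pass  # torch not installed in this sketch's environment

gc.collect()
```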