evalscope 0.15.0__tar.gz → 0.15.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.15.0/evalscope.egg-info → evalscope-0.15.1}/PKG-INFO +5 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/evaluator.py +7 -1
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/chat_adapter.py +3 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/benchmark.py +4 -3
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/main.py +4 -2
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/db_util.py +8 -6
- evalscope-0.15.1/evalscope/version.py +4 -0
- {evalscope-0.15.0 → evalscope-0.15.1/evalscope.egg-info}/PKG-INFO +5 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/requires.txt +4 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/aigc.txt +1 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/framework.txt +1 -2
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/test_t2i.py +4 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_run.py +12 -5
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/test_perf.py +4 -2
- evalscope-0.15.0/evalscope/version.py +0 -4
- {evalscope-0.15.0 → evalscope-0.15.1}/LICENSE +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/MANIFEST.in +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/iquiz/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/math_500/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/musr/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/simple_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/config.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/constants.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/llm_judge.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/base_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/server_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/model.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/register.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/app.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/app_arguments.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/report/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/run.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/filters.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/SOURCES.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/app.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/docs.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/perf.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/rag.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/requirements.txt +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/setup.cfg +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/setup.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_all.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_collection.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/test_run_all.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.15.0 → evalscope-0.15.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.15.0/evalscope.egg-info → evalscope-0.15.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -70,6 +69,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -83,12 +83,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -121,6 +120,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open_clip_torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 
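Note: the net effect of the metadata changes above is that omegaconf moves from the core dependencies into the aigc extra (and, through it, the all extra), and pyyaml gains a >=5.1 floor. Presumably the AIGC/text-to-image tooling is what still needs omegaconf, since it now sits alongside diffusers and open_clip_torch; if your workflow relies on it, installing with the extra, e.g. pip install 'evalscope[aigc]' (standard pip extras syntax), keeps it available.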
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py

@@ -34,7 +34,7 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         subset_list = subset_list or self.subset_list
 
         data_file_dict = defaultdict(str)
-        …
+        data_item_dict = defaultdict(list)
 
         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -49,10 +49,10 @@ class GeneralT2IAdapter(T2IBaseAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                …
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {subset_name: {'test': …
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
 
         return data_dict
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -33,7 +33,7 @@ class GeneralQAAdapter(DataAdapter):
         subset_list = subset_list or self.subset_list
 
         data_file_dict = defaultdict(str)
-        …
+        data_item_dict = defaultdict(list)
 
         # get data file path and subset name
         if os.path.isdir(dataset_name_or_path):
@@ -48,11 +48,11 @@ class GeneralQAAdapter(DataAdapter):
         # load data from local disk
         try:
             for subset_name, file_path in data_file_dict.items():
-                …
+                data_item_dict[subset_name] = jsonl_to_list(file_path)
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {subset_name: {'test': …
+        data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
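Both general adapters (text-to-image above, general QA here) receive the same fix: loaded rows are accumulated per subset in a data_item_dict, and the final mapping is built from it, so each subset's 'test' split carries its own items. A minimal sketch of the resulting load pattern — the file path is illustrative and the import location of jsonl_to_list is an assumption:

    from collections import defaultdict
    from evalscope.utils.io_utils import jsonl_to_list  # assumed location of the JSONL helper

    data_file_dict = {'default': 'data/default.jsonl'}  # hypothetical subset -> file map
    data_item_dict = defaultdict(list)
    for subset_name, file_path in data_file_dict.items():
        data_item_dict[subset_name] = jsonl_to_list(file_path)  # rows of one subset
    data_dict = {s: {'test': data_item_dict[s]} for s in data_file_dict}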
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/evaluator/evaluator.py

@@ -317,6 +317,8 @@ class Evaluator(object):
         """
 
         review_res_list = []
+        max_choices = max(
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@ class Evaluator(object):
             if len(review_d[AnswerKeys.CHOICES]) == 0:
                 logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
             else:
                 review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+                    logger.warning(
+                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
 
             review_res_list.append(review_res)
 
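The evaluator change makes aggregation consistent when answers carry different numbers of sampled choices: a result is unwrapped to a scalar only when every reviewed answer has exactly one choice; otherwise all results stay as lists and a warning flags answers with fewer choices than the maximum. A runnable illustration with simplified keys (the real code uses the AnswerKeys/ReviewKeys constants):

    # hypothetical review results: one answer with 1 choice, one with 2
    reviews = [{'choices': [True]}, {'choices': [True, False]}]
    max_choices = max(len(r['choices']) for r in reviews)
    review_res_list = [
        r['choices'][0] if len(r['choices']) == 1 and max_choices == 1 else r['choices']
        for r in reviews
    ]
    print(review_res_list)  # [[True], [True, False]]: lists kept, since max_choices == 2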
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/models/adapters/chat_adapter.py

@@ -100,10 +100,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             if i < len(system_prompts) and system_prompts[i]:
                 messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
             # whether thinking is needed
-            …
-            if …
+            chat_template_kwargs = infer_cfg.get('chat_template_kwargs', None)
+            if chat_template_kwargs is not None:
                 prompts = self.tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True, …
+                    messages, tokenize=False, add_generation_prompt=True, **chat_template_kwargs)
             else:
                 prompts = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             formatted_prompts.append(prompts)
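The new chat_template_kwargs hook forwards arbitrary key/value pairs from the inference config straight into tokenizer.apply_chat_template. A hedged sketch of a config that uses it — both keys below are illustrative; enable_thinking is just an example of a kwarg that some chat templates (e.g. Qwen3's) understand:

    infer_cfg = {
        'max_new_tokens': 512,                               # hypothetical generation setting
        'chat_template_kwargs': {'enable_thinking': False},  # forwarded verbatim as **kwargs
    }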
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/benchmark.py

@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:
 
 
 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@ async def benchmark(args: Arguments) -> None:
     data_process_completed_event.set()
 
     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/perf/main.py

@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
 
-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))
 
 
 if __name__ == '__main__':
    args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
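Taken together, the perf changes make benchmark results consumable from Python: run_perf_benchmark (via the async benchmark above) now returns a (metrics_result, percentile_result) tuple instead of only logging and writing them. A hedged usage sketch — the Arguments fields shown are assumptions based on common perf options; see evalscope/perf/arguments.py for the real set:

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark

    args = Arguments(
        url='http://127.0.0.1:8000/v1/chat/completions',  # hypothetical endpoint
        model='my-model',                                 # hypothetical model name
        dataset='openqa',                                 # a dataset plugin shipped with evalscope
        number=20,                                        # args.number, used by summary_result below
    )
    metrics_result, percentile_result = run_perf_benchmark(args)
    print(metrics_result['Expected number of requests'])  # key added by summary_result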
@@ -7,7 +7,7 @@ import sqlite3
|
|
|
7
7
|
import sys
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from tabulate import tabulate
|
|
10
|
-
from typing import Dict, List
|
|
10
|
+
from typing import Dict, List, Tuple
|
|
11
11
|
|
|
12
12
|
from evalscope.perf.arguments import Arguments
|
|
13
13
|
from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
|
|
@@ -200,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
|
|
|
200
200
|
return results
|
|
201
201
|
|
|
202
202
|
|
|
203
|
-
def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
|
|
203
|
+
def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
|
|
204
204
|
result_path = os.path.dirname(result_db_path)
|
|
205
205
|
write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
|
|
206
206
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
write_json_file(
|
|
207
|
+
metrics_result = metrics.create_message()
|
|
208
|
+
metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
|
|
209
|
+
write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
|
|
210
210
|
|
|
211
211
|
# Print summary in a table
|
|
212
|
-
table = tabulate(list(
|
|
212
|
+
table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
|
|
213
213
|
logger.info('\nBenchmarking summary:\n' + table)
|
|
214
214
|
|
|
215
215
|
# Get percentile results
|
|
@@ -223,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    return metrics_result, percentile_result
+
 
 def speed_benchmark_result(result_db_path: str):
     query_sql = """
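Besides returning the results, summary_result now persists the metrics message as benchmark_summary.json next to the result DB, so the same numbers can be re-read after the run. A small sketch; the output directory below is a placeholder (evalscope writes timestamped per-run directories):

import json
import os

result_path = 'outputs/my_perf_run'  # placeholder for the directory containing the result DB

with open(os.path.join(result_path, 'benchmark_summary.json')) as f:
    summary = json.load(f)

# 'Expected number of requests' and 'Result DB path' are the keys summary_result
# adds on top of whatever BenchmarkMetrics.create_message produces.
print(summary['Expected number of requests'])
print(summary['Result DB path'])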
{evalscope-0.15.0 → evalscope-0.15.1/evalscope.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.0
+Version: 0.15.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,12 +26,11 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
-Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
-Requires-Dist: pyyaml
+Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
@@ -70,6 +69,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: aigc
 Requires-Dist: diffusers; extra == "aigc"
 Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
@@ -83,12 +83,11 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml; extra == "all"
+Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -121,6 +120,7 @@ Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
 Requires-Dist: iopath; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open_clip_torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
 
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope.egg-info/requires.txt
@@ -8,12 +8,11 @@ latex2sympy2
 matplotlib
 modelscope[framework]
 nltk>=3.9
-omegaconf
 openai
 pandas
 pillow
 pyarrow
-pyyaml
+pyyaml>=5.1
 requests
 rouge-chinese
 rouge-score>=0.1.0
@@ -31,6 +30,7 @@ word2number
 [aigc]
 diffusers
 iopath
+omegaconf
 open_clip_torch
 opencv-python
 
@@ -45,12 +45,11 @@ latex2sympy2
 matplotlib
 modelscope[framework]
 nltk>=3.9
-omegaconf
 openai
 pandas
 pillow
 pyarrow
-pyyaml
+pyyaml>=5.1
 requests
 rouge-chinese
 rouge-score>=0.1.0
@@ -83,6 +82,7 @@ gradio==5.4.0
 plotly<6.0.0,>=5.23.0
 diffusers
 iopath
+omegaconf
 open_clip_torch
 opencv-python
 
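The packaging changes drop omegaconf from the core requirements (it is now pulled in only by the aigc and all extras) and raise the pyyaml floor to 5.1. A quick sanity check of an installed environment, sketched with importlib.metadata and packaging; packaging is assumed to be available, as it usually ships alongside pip/setuptools:

from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

# evalscope 0.15.1 pins pyyaml>=5.1 in the core requirements.
assert Version(version('PyYAML')) >= Version('5.1')

try:
    version('omegaconf')
except PackageNotFoundError:
    # Expected for a plain `pip install evalscope`; omegaconf now comes
    # only with the aigc (or all) extra.
    print('omegaconf not installed; use the aigc extra if you need it')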
{evalscope-0.15.0 → evalscope-0.15.1}/tests/aigc/test_t2i.py
@@ -59,9 +59,9 @@ class TestRun(unittest.TestCase):
             },
             datasets=[
                 'tifa160',
-                'genai_bench',
-                'evalmuse',
-                'hpdv2',
+                # 'genai_bench',
+                # 'evalmuse',
+                # 'hpdv2',
             ],
             dataset_args={
                 'tifa160': {
@@ -81,7 +81,7 @@ class TestRun(unittest.TestCase):
                 'num_inference_steps': 50,
                 'guidance_scale': 7.5
             },
-            use_cache='outputs/20250427_134122',
+            # use_cache='outputs/20250427_134122',
         )
 
         run_task(task_cfg=task_cfg)
{evalscope-0.15.0 → evalscope-0.15.1}/tests/cli/test_run.py
@@ -207,13 +207,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='Qwen/
+            model='Qwen/Qwen3-1.7B',
             datasets=[
                 # 'iquiz',
                 # 'math_500',
-
+                'aime24',
                 # 'competition_math',
-                'mmlu',
+                # 'mmlu',
             ],
             dataset_args={
                 'competition_math': {
@@ -224,8 +224,15 @@ class TestRun(unittest.TestCase):
                     'few_shot_num': 0
                 },
             },
-            limit=
-            eval_batch_size=
+            limit=5,
+            eval_batch_size=5,
+            generation_config={
+                'max_new_tokens': 1000,  # maximum number of generated tokens; a large value is recommended to avoid truncated output
+                'temperature': 0.7,  # sampling temperature (recommended in the Qwen report)
+                'top_p': 0.8,  # top-p sampling (recommended in the Qwen report)
+                'top_k': 20,  # top-k sampling (recommended in the Qwen report)
+                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
+            }
         )
 
         run_task(task_cfg=task_cfg)
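The new generation_config mirrors the sampling settings recommended for Qwen3 (temperature 0.7, top_p 0.8, top_k 20), and chat_template_kwargs carries the template-level enable_thinking switch. How that switch typically takes effect at the tokenizer level, sketched with transformers rather than this repo's internals (an illustration only):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-1.7B')
messages = [{'role': 'user', 'content': 'What is 2 + 2?'}]

# Extra keyword arguments to apply_chat_template are exposed to the Jinja chat
# template; Qwen3's template reads enable_thinking to skip the <think> block.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(prompt)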
{evalscope-0.15.0 → evalscope-0.15.1}/tests/perf/test_perf.py
@@ -103,7 +103,7 @@ class TestPerf(unittest.TestCase):
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=20,
-            model='
+            model='Qwen3-1.7B',
             url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
@@ -117,7 +117,9 @@ class TestPerf(unittest.TestCase):
             seed=None,
             extra_args={'ignore_eos': True}
         )
-        run_perf_benchmark(task_cfg)
+        metrics_result, percentile_result = run_perf_benchmark(task_cfg)
+        print(metrics_result)
+        print(percentile_result)
 
 
 if __name__ == '__main__':
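Since the perf entry point now returns its results, the test could assert on them instead of only printing. A sketch; only the two keys set explicitly in summary_result are guaranteed by this diff, and everything else in metrics_result depends on BenchmarkMetrics.create_message:

metrics_result, percentile_result = run_perf_benchmark(task_cfg)

# Keys added by summary_result in db_util.py (see the hunk above).
assert metrics_result['Expected number of requests'] == task_cfg.number
assert 'Result DB path' in metrics_result
assert isinstance(percentile_result, dict)  # Dict per the new Tuple[Dict, Dict] annotation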
RENAMED, file without changes:
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt
{evalscope-0.15.0 → evalscope-0.15.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt