evalscope 0.15.1__tar.gz → 0.16.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- {evalscope-0.15.1/evalscope.egg-info → evalscope-0.16.1}/PKG-INFO +57 -31
- {evalscope-0.15.1 → evalscope-0.16.1}/README.md +42 -18
- evalscope-0.16.1/evalscope/app/__init__.py +28 -0
- {evalscope-0.15.1/evalscope/report → evalscope-0.16.1/evalscope/app}/app.py +67 -59
- evalscope-0.16.1/evalscope/app/constants.py +21 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/arguments.py +12 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/backend_manager.py +2 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/embedding.py +75 -35
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/llm.py +1 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/benchmark.py +1 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_adapter.py +101 -18
- evalscope-0.16.1/evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope-0.16.1/evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope-0.16.1/evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope-0.16.1/evalscope/benchmarks/drop/utils.py +59 -0
- evalscope-0.16.1/evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope-0.16.1/evalscope/benchmarks/frames/utils.py +37 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope-0.16.1/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope-0.16.1/evalscope/benchmarks/needle_haystack/utils.py +79 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- evalscope-0.16.1/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope-0.16.1/evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope-0.16.1/evalscope/benchmarks/utils.py +60 -0
- evalscope-0.16.1/evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_app.py +2 -2
- evalscope-0.16.1/evalscope/collections/__init__.py +35 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/evaluator.py +94 -32
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/config.py +54 -17
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/evaluator.py +80 -41
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/__init__.py +3 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/llm_judge.py +15 -8
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/math_parser.py +1 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/rouge_metric.py +11 -13
- evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/chat_adapter.py +51 -34
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/server_adapter.py +17 -25
- evalscope-0.16.1/evalscope/perf/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/arguments.py +16 -7
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/benchmark.py +0 -15
- evalscope-0.16.1/evalscope/perf/main.py +103 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope-0.16.1/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/benchmark_util.py +34 -16
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/db_util.py +25 -15
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/local_server.py +1 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/log_utils.py +12 -5
- evalscope-0.16.1/evalscope/perf/utils/rich_display.py +186 -0
- evalscope-0.16.1/evalscope/report/__init__.py +38 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/combinator.py +8 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/generator.py +33 -9
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/utils.py +61 -4
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/run.py +12 -0
- evalscope-0.16.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope-0.16.1/evalscope/utils/deprecation_utils.py +42 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/logger.py +1 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/utils.py +12 -0
- evalscope-0.16.1/evalscope/version.py +4 -0
- {evalscope-0.15.1 → evalscope-0.16.1/evalscope.egg-info}/PKG-INFO +57 -31
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/SOURCES.txt +23 -2
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/requires.txt +14 -12
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/framework.txt +2 -2
- evalscope-0.16.1/requirements/opencompass.txt +1 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/perf.txt +2 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/rag.txt +1 -1
- evalscope-0.16.1/requirements/vlmeval.txt +1 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/aigc/test_t2i.py +40 -3
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_all.py +39 -32
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_collection.py +8 -6
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_run.py +43 -17
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/perf/test_perf.py +23 -0
- evalscope-0.16.1/tests/rag/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_mteb.py +5 -5
- evalscope-0.15.1/evalscope/benchmarks/utils.py +0 -34
- evalscope-0.15.1/evalscope/collections/__init__.py +0 -3
- evalscope-0.15.1/evalscope/perf/main.py +0 -46
- evalscope-0.15.1/evalscope/report/__init__.py +0 -6
- evalscope-0.15.1/evalscope/version.py +0 -4
- evalscope-0.15.1/requirements/opencompass.txt +0 -1
- evalscope-0.15.1/requirements/vlmeval.txt +0 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/LICENSE +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/MANIFEST.in +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/__init__.py +0 -0
- evalscope-0.15.1/evalscope/report/app_arguments.py → evalscope-0.16.1/evalscope/app/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/general_mcq → evalscope-0.16.1/evalscope/benchmarks/docmath}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/gpqa → evalscope-0.16.1/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/ifeval → evalscope-0.16.1/evalscope/benchmarks/frames}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/iquiz → evalscope-0.16.1/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/live_code_bench → evalscope-0.16.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/maritime_bench → evalscope-0.16.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/math_500 → evalscope-0.16.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/mmlu_pro → evalscope-0.16.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/mmlu_redux → evalscope-0.16.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/musr → evalscope-0.16.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/process_bench → evalscope-0.16.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/simple_qa → evalscope-0.16.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/super_gpqa → evalscope-0.16.1/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models → evalscope-0.16.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/perf → evalscope-0.16.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/constants.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.15.1/evalscope/perf/utils → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/third_party/thinkbench/tools → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.15.1/tests/rag → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/base_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/register.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/filters.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/aigc.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/app.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements/docs.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/requirements.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/setup.cfg +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/setup.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/aigc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/test_run_all.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.1}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.1
+Version: 0.16.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
@@ -45,24 +45,25 @@ Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 Provides-Extra: rag
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "rag"
 Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
 Requires-Dist: aiohttp; extra == "perf"
 Requires-Dist: fastapi; extra == "perf"
 Requires-Dist: numpy; extra == "perf"
+Requires-Dist: rich; extra == "perf"
 Requires-Dist: sse_starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
-Requires-Dist:
+Requires-Dist: uvicorn; extra == "perf"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -74,12 +75,12 @@ Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
@@ -101,21 +102,22 @@ Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "all"
 Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
 Requires-Dist: numpy; extra == "all"
+Requires-Dist: rich; extra == "all"
 Requires-Dist: sse_starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
-Requires-Dist:
+Requires-Dist: uvicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
@@ -177,9 +179,23 @@ Requires-Dist: opencv-python; extra == "all"

 ## 📝 Introduction

-EvalScope is [ModelScope](https://modelscope.cn/)
+EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:

-
+- 🧠 Large Language Models
+- 🎨 Multimodal Models
+- 🔍 Embedding Models
+- 🏆 Reranker Models
+- 🖼️ CLIP Models
+- 🎭 AIGC Models (Image-to-Text/Video)
+- ...and more!
+
+EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
+
+Below is the overall architecture diagram of EvalScope:

 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,10 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
+- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -479,26 +499,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
 ```shell
 evalscope eval \
- --model Qwen/
- --model-args revision
- --generation-config do_sample
+ --model Qwen/Qwen3-0.6B \
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
 ```
 
-### Parameter
-- `--model-args`: Model loading parameters,
-  - `revision`: Model version
-  - `precision`: Model precision
-  - `device_map`:
-- `--generation-config`: Generation parameters,
-  - `do_sample`: Whether to use sampling
-  - `
-  - `max_new_tokens`: Maximum length of
-  -
+### Parameter Description
+- `--model-args`: Model loading parameters, passed as a JSON string:
+  - `revision`: Model version
+  - `precision`: Model precision
+  - `device_map`: Device allocation for the model
+- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+  - `do_sample`: Whether to use sampling
+  - `temperature`: Generation temperature
+  - `max_new_tokens`: Maximum length of generated tokens
+  - `chat_template_kwargs`: Model inference template parameters
+- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
   - `few_shot_num`: Number of few-shot examples
-  - `few_shot_random`: Whether to randomly sample few-shot data
+  - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
@@ -517,6 +538,11 @@ A stress testing tool focused on large language models, which can be customized
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+**Output example**
+
+
+
+
 **Supports wandb for recording results**
 
 
@@ -565,7 +591,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </a>
 
 ## 🔜 Roadmap
-- [
+- [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
@@ -575,7 +601,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
   - [ ] GAIA
-  - [
+  - [x] GPQA
   - [x] MBPP
 
 
@@ -51,9 +51,23 @@
 
 ## 📝 Introduction
 
-EvalScope is [ModelScope](https://modelscope.cn/)
+EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
 
-
+- 🧠 Large Language Models
+- 🎨 Multimodal Models
+- 🔍 Embedding Models
+- 🏆 Reranker Models
+- 🖼️ CLIP Models
+- 🎭 AIGC Models (Image-to-Text/Video)
+- ...and more!
+
+EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
+
+Below is the overall architecture diagram of EvalScope:
 
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -88,6 +102,10 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
+- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -353,26 +371,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
 ```shell
 evalscope eval \
- --model Qwen/
- --model-args revision
- --generation-config do_sample
+ --model Qwen/Qwen3-0.6B \
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
 ```
 
-### Parameter
-- `--model-args`: Model loading parameters,
-  - `revision`: Model version
-  - `precision`: Model precision
-  - `device_map`:
-- `--generation-config`: Generation parameters,
-  - `do_sample`: Whether to use sampling
-  - `
-  - `max_new_tokens`: Maximum length of
-  -
+### Parameter Description
+- `--model-args`: Model loading parameters, passed as a JSON string:
+  - `revision`: Model version
+  - `precision`: Model precision
+  - `device_map`: Device allocation for the model
+- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+  - `do_sample`: Whether to use sampling
+  - `temperature`: Generation temperature
+  - `max_new_tokens`: Maximum length of generated tokens
+  - `chat_template_kwargs`: Model inference template parameters
+- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
   - `few_shot_num`: Number of few-shot examples
-  - `few_shot_random`: Whether to randomly sample few-shot data
+  - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
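For reference, the same run can also be driven from Python rather than the CLI. The sketch below simply mirrors the shell command above; it assumes the `TaskConfig`/`run_task` entry points described in the EvalScope quick-start documentation, so treat the exact names as an assumption rather than something shown in this diff.

```python
# A sketch of the equivalent Python call; assumes evalscope exposes
# TaskConfig / run_task as documented in its quick-start guide.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen3-0.6B',
    model_args={'revision': 'master', 'precision': 'torch.float16', 'device_map': 'auto'},
    generation_config={
        'do_sample': True,
        'temperature': 0.6,
        'max_new_tokens': 512,
        'chat_template_kwargs': {'enable_thinking': False},
    },
    datasets=['gsm8k'],
    dataset_args={'gsm8k': {'few_shot_num': 0, 'few_shot_random': False}},
    limit=10,
)

run_task(task_cfg=task_cfg)
```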
@@ -391,6 +410,11 @@ A stress testing tool focused on large language models, which can be customized
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+**Output example**
+
+
+
+
 **Supports wandb for recording results**
 
 
@@ -439,7 +463,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </a>
 
 ## 🔜 Roadmap
-- [
+- [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
@@ -449,7 +473,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
   - [ ] GAIA
-  - [
+  - [x] GPQA
   - [x] MBPP
 
 
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .app import create_app
+    from .arguments import add_argument
+
+else:
+    _import_structure = {
+        'app': [
+            'create_app',
+        ],
+        'arguments': [
+            'add_argument',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
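The new `evalscope.app` package initializer above wires its public names (`create_app`, `add_argument`) through `_LazyModule`, so heavy UI dependencies are only imported when an attribute is first accessed. The internals of `_LazyModule` are not part of this diff; the snippet below is a generic, hypothetical sketch of the lazy-loading pattern, not the actual implementation.

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Generic lazy loader: resolve attributes to submodule objects on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        module_name = self._attr_to_module.get(attr)
        if module_name is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        # Import the submodule only now, then cache the resolved attribute.
        module = importlib.import_module(f'.{module_name}', self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)
        return value
```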
@@ -11,35 +11,15 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey,
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
+from .arguments import add_argument
+from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
 
 logger = get_logger()
 
-PLOTLY_THEME = 'plotly_dark'
-REPORT_TOKEN = '@@'
-MODEL_TOKEN = '::'
-DATASET_TOKEN = ', '
-LATEX_DELIMITERS = [{
-    'left': '$$',
-    'right': '$$',
-    'display': True
-}, {
-    'left': '$',
-    'right': '$',
-    'display': False
-}, {
-    'left': '\\(',
-    'right': '\\)',
-    'display': False
-}, {
-    'left': '\\[',
-    'right': '\\]',
-    'display': True
-}]
-
 
 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -185,6 +165,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
     return df, styler
 
 
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
 def plot_single_dataset_scores(df: pd.DataFrame):
     # TODO: add metric radio and relace category name
     plot = px.bar(
@@ -223,6 +210,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
     return fig
 
 
+def convert_markdown_image(text):
+    if not os.path.isfile(text):
+        return text
+    # Convert the image path to a markdown image tag
+    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+        text = os.path.abspath(text)
+        image_tag = f''
+        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+        return image_tag
+    return text
+
+
+def convert_html_tags(text):
+    # match begin label
+    text = re.sub(r'<(\w+)>', r'[\1]', text)
+    # match end label
+    text = re.sub(r'</(\w+)>', r'[/\1]', text)
+    return text
+
+
+def process_string(string: str, max_length: int = 2048) -> str:
+    string = convert_html_tags(string)  # for display labels e.g. `<think>`
+    if max_length and len(string) > max_length:
+        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+    return string
+
+
 def dict_to_markdown(data) -> str:
     markdown_lines = []
 
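These helpers prepare raw model output for the Gradio report viewer: angle-bracket tags are rewritten to square brackets so they render literally, and over-long strings are clipped in the middle. A standalone, simplified check of that behaviour (the sample input is made up):

```python
import re


def convert_html_tags(text):
    # Rewrite <tag> / </tag> into [tag] / [/tag] so Markdown shows them literally.
    text = re.sub(r'<(\w+)>', r'[\1]', text)
    text = re.sub(r'</(\w+)>', r'[/\1]', text)
    return text


def clip_middle(string: str, max_length: int = 2048) -> str:
    # Keep the head and tail of an over-long string, dropping the middle.
    if max_length and len(string) > max_length:
        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
    return string


print(convert_html_tags('<think>some chain of thought</think> final answer'))
# -> [think]some chain of thought[/think] final answer
print(clip_middle('x' * 100, max_length=10))
# -> xxxxx......xxxxx
```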
@@ -230,55 +244,41 @@ def dict_to_markdown(data) -> str:
         bold_key = f'**{key}**'
 
         if isinstance(value, list):
-            value_str = '\n' + '\n'.join([f'
+            value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
         elif isinstance(value, dict):
             value_str = dict_to_markdown(value)
         else:
             value_str = str(value)
 
-        value_str = process_string(value_str)
-        markdown_line = f'{bold_key}
+        value_str = process_string(value_str, max_length=None)  # Convert HTML tags but don't truncate
+        markdown_line = f'{bold_key}:\n{value_str}'
         markdown_lines.append(markdown_line)
 
     return '\n\n'.join(markdown_lines)
 
 
-def
-
-
-    # match end label
-    text = re.sub(r'</(\w+)>', r'[/\1]', text)
-    return text
+def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+    """
+    Process model prediction output into a formatted string.
 
+    Args:
+        item: The item to process. Can be a string, list, or dictionary.
+        max_length: The maximum length of the output string.
 
-
-
-
-    # Convert the image path to a markdown image tag
-    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
-        text = os.path.abspath(text)
-        image_tag = f''
-        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
-        return image_tag
-    return text
-
-
-def process_string(string: str, max_length: int = 2048) -> str:
-    string = convert_html_tags(string)  # for display labels e.g. `<think>`
-    if len(string) > max_length:
-        return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
-    return string
-
-
-def process_model_prediction(item: Any):
+    Returns:
+        A formatted string representation of the input.
+    """
     if isinstance(item, dict):
-
-        return process_string(res)
+        result = dict_to_markdown(item)
     elif isinstance(item, list):
-
-        return process_string(res)
+        result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
     else:
-
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+    return result
 
 
 def normalize_score(score):
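The reworked `process_model_prediction` recurses without truncation and clips only the final assembled string, so nested lists and dictionaries are no longer cut off piecewise. A self-contained imitation of that strategy (names are illustrative, not the module's API):

```python
def clip(s: str, max_length: int = 2048) -> str:
    # Drop the middle of over-long strings, keeping head and tail.
    if max_length and len(s) > max_length:
        return f'{s[:max_length // 2]}......{s[-max_length // 2:]}'
    return s


def render(item, max_length: int = 2048) -> str:
    # Recurse without truncation, then clip once at the outermost call.
    if isinstance(item, dict):
        result = '\n'.join(f'**{k}**:\n{render(v, max_length=None)}' for k, v in item.items())
    elif isinstance(item, list):
        result = '\n'.join(f'- {render(i, max_length=None)}' for i in item)
    else:
        result = str(item)
    return clip(result, max_length) if max_length is not None else result


print(render({'prediction': ['a' * 40, {'answer': 42}]}, max_length=60))
```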
@@ -443,6 +443,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
             'zh': '数据集分数',
             'en': 'Dataset Scores'
         },
+        'report_analysis': {
+            'zh': '报告智能分析',
+            'en': 'Report Intelligent Analysis'
+        },
         'dataset_scores_table': {
             'zh': '数据集分数表',
             'en': 'Dataset Scores Table'
@@ -498,6 +502,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         with gr.Tab(locale_dict['dataset_details'][lang]):
             dataset_radio = gr.Radio(
                 label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+            # show dataset details
+            with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+                report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
             gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
             dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
             gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -573,15 +580,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
     @gr.on(
         triggers=[dataset_radio.change, report_list.change],
         inputs=[dataset_radio, report_list],
-        outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
         report_df = get_data_frame(report_list)
+        analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
         subsets = data_score_df[ReportKey.subset_name].unique().tolist()
         logger.debug(f'subsets: {subsets}')
-        return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+        return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
     @gr.on(
         triggers=[subset_select.change],
@@ -0,0 +1,21 @@
+PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
@@ -9,6 +9,15 @@ class ParseStrArgsAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         assert isinstance(values, str), 'args should be a string.'
 
+        # try json load first
+        try:
+            arg_dict = json.loads(values)
+            setattr(namespace, self.dest, arg_dict)
+            return
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+        # If JSON load fails, fall back to parsing as key=value pairs
         arg_dict = {}
         for arg in values.strip().split(','):
             key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
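With JSON tried first, string-valued options such as `--model-args` and `--generation-config` accept either a JSON object or the older `key=value,key=value` form. A minimal standalone sketch of the same fallback logic (illustrative only, not the evalscope class itself):

```python
import json


def parse_str_args(values: str) -> dict:
    # Try to interpret the whole string as a JSON object first.
    try:
        return json.loads(values)
    except (json.JSONDecodeError, ValueError):
        pass
    # Fall back to comma-separated key=value pairs.
    arg_dict = {}
    for arg in values.strip().split(','):
        key, value = map(str.strip, arg.split('=', 1))
        arg_dict[key] = value
    return arg_dict


print(parse_str_args('{"revision": "master", "device_map": "auto"}'))
print(parse_str_args('revision=master,device_map=auto'))
```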
@@ -58,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
@@ -67,6 +76,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--work-dir', type=str, help='The root cache dir.')
 
     # Debug and runtime mode arguments
+    parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
@@ -79,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
     # yapf: enable
 
 
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import subprocess
 import tempfile
 from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
            model_d['meta_template'] = get_template(model_d['meta_template'])
 
            # set the 'abbr' as the 'path' if 'abbr' is not specified
-           model_d['abbr'] = model_d['path']
+           model_d['abbr'] = os.path.basename(model_d['path'])
 
            model_config = ApiModelConfig(**model_d)
            models.append(asdict(model_config))
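Taking the basename keeps the OpenCompass result abbreviation short when the configured `path` is a full model id or filesystem path, for example:

```python
import os

# Hypothetical model paths, shown only to illustrate the abbreviation change.
print(os.path.basename('Qwen/Qwen3-0.6B'))                # Qwen3-0.6B
print(os.path.basename('/data/models/Qwen/Qwen3-0.6B'))   # Qwen3-0.6B
```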
@@ -11,7 +11,9 @@ class ModelArguments:
     pooling_mode: Optional[str] = None
     max_seq_length: int = 512  # max sequence length
     # prompt for llm based model
-    prompt: str =
+    prompt: Optional[str] = None
+    # prompts dictionary for different tasks, if prompt is not set
+    prompts: Optional[Dict[str, str]] = None
     # model kwargs
     model_kwargs: dict = field(default_factory=dict)
     # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
            'pooling_mode': self.pooling_mode,
            'max_seq_length': self.max_seq_length,
            'prompt': self.prompt,
+           'prompts': self.prompts,
            'model_kwargs': self.model_kwargs,
            'config_kwargs': self.config_kwargs,
            'encode_kwargs': self.encode_kwargs,