evalscope 1.0.1__tar.gz → 1.0.2__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
- {evalscope-1.0.1/evalscope.egg-info → evalscope-1.0.2}/PKG-INFO +2 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/README.md +1 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/benchmark.py +27 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/meta.py +3 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/evaluator/evaluator.py +5 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/evaluator/state.py +5 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/messages/chat_message.py +6 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/mixin/__init__.py +1 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope-1.0.2/evalscope/api/mixin/sandbox_mixin.py +204 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/generate_config.py +0 -3
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/model.py +1 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/tool/tool_info.py +1 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/arguments.py +6 -0
- evalscope-1.0.2/evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope-1.0.2/evalscope/benchmarks/amc/amc_adapter.py +46 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bfcl/generation.py +7 -7
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope-1.0.2/evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope-1.0.2/evalscope/benchmarks/healthbench/utils.py +102 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope-1.0.2/evalscope/benchmarks/humaneval/utils.py +235 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope-1.0.2/evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope-1.0.2/evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope-1.0.2/evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope-1.0.2/evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope-1.0.2/evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope-1.0.2/evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope-1.0.2/evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope-1.0.2/evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope-1.0.2/evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope-1.0.2/evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope-1.0.2/evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-1.0.2/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/config.py +24 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/constants.py +3 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/evaluator/evaluator.py +25 -7
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/metric.py +27 -2
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope-1.0.2/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/model_apis.py +10 -8
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/utils/openai.py +1 -2
- evalscope-1.0.2/evalscope/perf/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/arguments.py +2 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/base.py +2 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/default_api.py +7 -7
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/openai_api.py +83 -19
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope-1.0.2/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/benchmark_util.py +1 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/report/combinator.py +0 -25
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/report/report.py +8 -4
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/run.py +1 -1
- evalscope-1.0.2/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope-1.0.2/evalscope/utils/function_utils.py +70 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/import_utils.py +63 -13
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/io_utils.py +19 -11
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/json_schema.py +23 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/logger.py +19 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/model_utils.py +1 -1
- evalscope-1.0.2/evalscope/version.py +4 -0
- {evalscope-1.0.1 → evalscope-1.0.2/evalscope.egg-info}/PKG-INFO +2 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/SOURCES.txt +28 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/requires.txt +4 -9
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/dev.txt +0 -2
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/framework.txt +1 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/perf.txt +1 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/setup.py +0 -1
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/benchmark/test_eval.py +51 -7
- evalscope-1.0.2/tests/benchmark/test_sandbox.py +81 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/benchmark/test_vlm.py +60 -3
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/perf/test_perf.py +40 -12
- evalscope-1.0.2/tests/rag/__init__.py +0 -0
- evalscope-1.0.1/evalscope/utils/function_utils.py +0 -29
- evalscope-1.0.1/evalscope/version.py +0 -4
- {evalscope-1.0.1 → evalscope-1.0.2}/LICENSE +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/MANIFEST.in +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/image_edit_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/dataset/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/dataset/dataset.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/dataset/loader.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/dataset/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/evaluator/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/evaluator/cache.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/filter/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/filter/filter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/messages/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/messages/content.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/messages/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/metric/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/metric/metric.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/metric/scorer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/model_output.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/registry.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/tool/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/tool/tool_call.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/tool/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/app.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/arguments.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/constants.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/app_ui.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/multi_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/sidebar.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/single_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/ui/visualization.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/utils/data_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/utils/env_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/utils/localization.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/utils/text_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/app/utils/visualization.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/base.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/aime → evalscope-1.0.2/evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/alpaca_eval → evalscope-1.0.2/evalscope/benchmarks/aime}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/arena_hard → evalscope-1.0.2/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/bfcl → evalscope-1.0.2/evalscope/benchmarks/amc}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/chinese_simple_qa → evalscope-1.0.2/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/data_collection → evalscope-1.0.2/evalscope/benchmarks/bfcl}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/docmath → evalscope-1.0.2/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/drop → evalscope-1.0.2/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/frames → evalscope-1.0.2/evalscope/benchmarks/docmath}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/docmath/utils.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/general_arena → evalscope-1.0.2/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/general_mcq → evalscope-1.0.2/evalscope/benchmarks/frames}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/frames/utils.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/gpqa → evalscope-1.0.2/evalscope/benchmarks/general_arena}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/general_arena/general_arena_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/general_arena/utils.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/hle → evalscope-1.0.2/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/ifeval → evalscope-1.0.2/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/gpqa/prompt.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/image_edit → evalscope-1.0.2/evalscope/benchmarks/healthbench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/image_edit/gedit → evalscope-1.0.2/evalscope/benchmarks/hle}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/hle/hle_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/iquiz → evalscope-1.0.2/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/live_code_bench → evalscope-1.0.2/evalscope/benchmarks/image_edit}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/maritime_bench → evalscope-1.0.2/evalscope/benchmarks/image_edit/gedit}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/image_edit/gedit/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/image_edit/gedit/vie_prompts.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/math_500 → evalscope-1.0.2/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/math_vista → evalscope-1.0.2/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/mmlu_pro → evalscope-1.0.2/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/mmlu_redux → evalscope-1.0.2/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/mmmu → evalscope-1.0.2/evalscope/benchmarks/math_vista}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/math_vista/math_vista_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/mmmu_pro → evalscope-1.0.2/evalscope/benchmarks/minerva_math}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/musr → evalscope-1.0.2/evalscope/benchmarks/mm_bench}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/needle_haystack → evalscope-1.0.2/evalscope/benchmarks/mm_star}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/process_bench → evalscope-1.0.2/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/simple_qa → evalscope-1.0.2/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/super_gpqa → evalscope-1.0.2/evalscope/benchmarks/mmmu}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/mmmu/mmmu_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/tau_bench → evalscope-1.0.2/evalscope/benchmarks/mmmu_pro}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/text2image → evalscope-1.0.2/evalscope/benchmarks/multi_if}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/tool_bench → evalscope-1.0.2/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/benchmarks/winogrande → evalscope-1.0.2/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics → evalscope-1.0.2/evalscope/benchmarks/olympiad_bench}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models → evalscope-1.0.2/evalscope/benchmarks/omni_bench}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-1.0.2/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-1.0.2/evalscope/benchmarks/real_world_qa}/__init__.py +0 -0
- {evalscope-1.0.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-1.0.2/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/super_gpqa/prompt.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-1.0.1/evalscope/perf → evalscope-1.0.2/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/tau_bench/generation.py +0 -0
- {evalscope-1.0.1/evalscope/perf/utils → evalscope-1.0.2/evalscope/benchmarks/text2image}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/text2image/evalmuse_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/text2image/genai_bench_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/text2image/general_t2i_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/text2image/hpdv2_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/text2image/tifa_adapter.py +0 -0
- {evalscope-1.0.1/evalscope/third_party/thinkbench/tools → evalscope-1.0.2/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/tool_bench/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-1.0.1/tests/rag → evalscope-1.0.2/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/base.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/cli.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/start_app.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/start_eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/start_perf.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/cli/start_server.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/collections/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/collections/sampler.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/collections/schema.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/filters/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/filters/extraction.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/filters/selection.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/llm_judge.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/metrics.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/image_edit_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/mockllm.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/modelscope.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/openai_compatible.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/models/text2image_model.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/benchmark.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/http_client.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/main.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/perf/utils/rich_display.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/report/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/report/generator.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/summarizer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/argument_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/chat_service.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/multi_choices.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope/utils/url_utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/aigc.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/app.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/docs.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/opencompass.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/rag.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements/vlmeval.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/requirements.txt +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/setup.cfg +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/benchmark/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/benchmark/test_image_edit.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/benchmark/test_t2i.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/cli/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/cli/test_all.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/cli/test_collection.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/cli/test_custom.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/cli/test_reasoning.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/common.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/perf/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/rag/test_mteb.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/rag/test_ragas.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/swift/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/test_run_all.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/utils.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/vlm/__init__.py +0 -0
- {evalscope-1.0.1 → evalscope-1.0.2}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-1.0.1/evalscope.egg-info → evalscope-1.0.2}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 1.0.1
+Version: 1.0.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -146,6 +146,7 @@ Please scan the QR code below to join our community groups:
 >
 > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.

+- 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
 - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
 - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
 - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
{evalscope-1.0.1 → evalscope-1.0.2}/README.md
RENAMED

@@ -117,6 +117,7 @@ Please scan the QR code below to join our community groups:
 >
 > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.

+- 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
 - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
 - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
 - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/default_data_adapter.py
RENAMED
@@ -642,9 +642,7 @@ class DefaultDataAdapter(DataAdapter):
         """
         pass
 
-    def _on_generate_report(
-        self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
-    ) -> Report:
+    def _on_generate_report(self, scores: Dict[str, List[AggScore]], model_name: str) -> Report:
         """
         Hook method called during report generation.
 
@@ -660,7 +658,7 @@ class DefaultDataAdapter(DataAdapter):
             Report: The generated evaluation report
         """
         return ReportGenerator.generate_report(
-            score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
+            score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=self.add_aggregation_name
        )
 
     @override
@@ -682,3 +680,7 @@ class DefaultDataAdapter(DataAdapter):
         report = self._on_generate_report(scores, model_name=model_name)
         self._on_generate_report_end(report, output_dir, **kwargs)
         return report
+
+    def finalize(self, *args, **kwargs):
+        # Finalize the evaluation process
+        self.sandbox_finalize(*args, **kwargs)
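Note: with this change, `add_aggregation_name` is read from adapter state rather than passed per call, so subclasses opt out in `__init__` instead of overriding `_on_generate_report`. A minimal sketch of the new pattern (the subclass is hypothetical; `DefaultDataAdapter` is imported the same way the new adapters in this release import it):

```python
# Hypothetical subclass: disable the aggregation-name prefix in reports by
# flipping the new instance attribute instead of overriding the hook.
from evalscope.api.benchmark import DefaultDataAdapter


class FlatReportAdapter(DefaultDataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.add_aggregation_name = False  # same pattern Text2ImageAdapter uses later in this diff
```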
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/multi_choice_adapter.py
RENAMED
@@ -18,8 +18,11 @@ class MultiChoiceAdapter(DefaultDataAdapter):
     This adapter formats the input for multi-choice questions and handles few-shot examples.
     """
 
-
-
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.multiple_correct: bool = False
+        """Whether the benchmark allows multiple correct answers."""
 
     def format_prompt_template(self, sample: Sample) -> str:
         """
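The new `multiple_correct` flag defaults to `False`; a benchmark whose questions admit several correct options would enable it in its own `__init__`. A sketch (hypothetical subclass, assuming `MultiChoiceAdapter` is exported from `evalscope.api.benchmark` like the other adapters in this diff):

```python
from evalscope.api.benchmark import MultiChoiceAdapter


class MultiAnswerAdapter(MultiChoiceAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.multiple_correct = True  # e.g. targets like 'AC' rather than a single letter
```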
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/text2image_adapter.py
RENAMED
@@ -19,6 +19,11 @@ logger = get_logger()
 class Text2ImageAdapter(DefaultDataAdapter):
     """Text to Image Adapter for benchmarks."""
 
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.add_aggregation_name = False  # Do not add aggregation name in the report by default
+
     def load_from_disk(self, **kwargs):
         return super().load_from_disk(use_local_loader=True)
 
@@ -150,7 +155,3 @@ class Text2ImageAdapter(DefaultDataAdapter):
                 score.metadata[metric_name] = f'error: {str(e)}'
 
         return score
-
-    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-        # Don't add aggregation name for needle haystack adapter
-        return super()._on_generate_report(scores, model_name, False)
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/adapters/vision_language_adapter.py
RENAMED
@@ -3,4 +3,6 @@ from .default_data_adapter import DefaultDataAdapter
 
 class VisionLanguageAdapter(DefaultDataAdapter):
     """Adapter for vision-language benchmarks. e.g., image captioning, visual question answering, etc."""
-
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/benchmark.py
RENAMED
@@ -9,7 +9,7 @@ from evalscope.api.dataset import DatasetDict, Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
 from evalscope.api.metric import AggScore, SampleScore
-from evalscope.api.mixin import LLMJudgeMixin
+from evalscope.api.mixin import LLMJudgeMixin, SandboxMixin
 from evalscope.api.model import Model
 from evalscope.report import Report
 from evalscope.utils.logger import get_logger
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
 logger = get_logger()
 
 
-class DataAdapter(LLMJudgeMixin, ABC):
+class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
     """
     Data Adapter for the benchmark.
     """
@@ -43,6 +43,12 @@ class DataAdapter(LLMJudgeMixin, ABC):
         self.save_metadata = True
         """Whether to save metadata in the review result"""
 
+        self.add_aggregation_name = True
+        """Whether to add aggregation name in the report"""
+
+        self.add_overall_metric = True
+        """Whether to add overall metric in the report"""
+
         self.category_map = {}
         """Category map for the benchmark"""
 
@@ -86,6 +92,11 @@ class DataAdapter(LLMJudgeMixin, ABC):
         """
         pass
 
+    @abstractmethod
+    def finalize(self, *args, **kwargs) -> None:
+        """Finalize the evaluation process."""
+        pass
+
     @property
     def name(self) -> str:
         """
@@ -334,6 +345,20 @@ class DataAdapter(LLMJudgeMixin, ABC):
         """
         self._benchmark_meta.shuffle_choices = value
 
+    @property
+    def review_timeout(self) -> Optional[float]:
+        """
+        Return the timeout for the review process.
+        """
+        return self._benchmark_meta.review_timeout
+
+    @review_timeout.setter
+    def review_timeout(self, value: float):
+        """
+        Set the timeout for the review process.
+        """
+        self._benchmark_meta.review_timeout = value
+
     @contextlib.contextmanager
     def _temporary_attribute(self, attr_name: str, new_value):
         """
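The `review_timeout` property added here simply delegates to the benchmark metadata object. A self-contained mini-version of the pattern (stand-in classes, not evalscope's own):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Meta:                     # stand-in for BenchmarkMeta
    review_timeout: Optional[float] = None


class Adapter:                  # stand-in for DataAdapter
    def __init__(self, meta: Meta):
        self._benchmark_meta = meta

    @property
    def review_timeout(self) -> Optional[float]:
        return self._benchmark_meta.review_timeout

    @review_timeout.setter
    def review_timeout(self, value: float) -> None:
        self._benchmark_meta.review_timeout = value


meta = Meta()
adapter = Adapter(meta)
adapter.review_timeout = 30.0
assert meta.review_timeout == 30.0  # the setter writes through to the metadata
```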
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/benchmark/meta.py
RENAMED
@@ -79,6 +79,9 @@ class BenchmarkMeta:
     shuffle_choices: bool = False
     """Whether to shuffle the choices in multiple-choice datasets."""
 
+    review_timeout: Optional[float] = None
+    """ Timeout for review in seconds."""
+
     extra_params: Dict = field(default_factory=dict)
     """ Additional parameters for the benchmark."""
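Since `review_timeout` is now a `BenchmarkMeta` field, a benchmark can declare it at registration time. A sketch (the benchmark itself is hypothetical, and `BenchmarkMeta` may require more arguments than shown; the field names come from this diff):

```python
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_bench',             # hypothetical benchmark name
        dataset_id='org/my_bench',   # hypothetical dataset
        metric_list=['acc'],
        review_timeout=120.0,        # allow slow (e.g. sandboxed) reviews two minutes
    )
)
class MyBenchAdapter(DefaultDataAdapter):

    def record_to_sample(self, record):
        ...
```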
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/evaluator/state.py
RENAMED
@@ -273,3 +273,8 @@
     def target(self) -> str:
         """The scoring target for this `Sample`."""
         return self._target.text
+
+    @target.setter
+    def target(self, text: str) -> None:
+        """Set the target for review purposes."""
+        self._target = Target(text)
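The new setter lets review-time code rewrite the scoring target on an existing `TaskState`, for example to normalize a gold answer before matching. A sketch (the function is illustrative; real `TaskState` instances come from the evaluator):

```python
from evalscope.api.evaluator import TaskState


def normalize_target(state: TaskState) -> None:
    # Uses the new @target.setter; previously target was read-only.
    state.target = state.target.strip().lower()
```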
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/messages/chat_message.py
RENAMED
@@ -3,7 +3,7 @@ from pydantic import BaseModel, Field, JsonValue, model_validator
 from typing import Any, Dict, List, Literal, Optional, Type, Union
 
 from evalscope.api.tool import ToolCall, ToolCallError
-from .content import Content, ContentImage, ContentReasoning, ContentText
+from .content import Content, ContentAudio, ContentImage, ContentReasoning, ContentText
 from .utils import parse_content_with_reasoning
 
 
@@ -225,6 +225,11 @@ def messages_to_markdown(messages: List[ChatMessage], max_length: Optional[int]
             if max_length and len(image_base64) > max_length:
                 image_base64 = image_base64[:max_length]
             content_parts.append(f'<img src="{image_base64}" />')
+        elif isinstance(content_item, ContentAudio):
+            audio_base64 = content_item.audio
+            if max_length and len(audio_base64) > max_length:
+                audio_base64 = audio_base64[:max_length]
+            content_parts.append(f"<audio controls src='{audio_base64}'></audio>")
         elif isinstance(content_item, ContentReasoning):
             content_parts.append(f'**Reasoning:** {content_item.reasoning}')
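The audio branch mirrors the image branch: long base64 payloads are clipped to `max_length` before being embedded in the markdown transcript. A self-contained sketch of that truncation logic:

```python
def embed_audio(audio_base64: str, max_length: int = 0) -> str:
    # Clip oversized payloads, then emit the same HTML tag the diff adds.
    if max_length and len(audio_base64) > max_length:
        audio_base64 = audio_base64[:max_length]
    return f"<audio controls src='{audio_base64}'></audio>"


print(embed_audio('data:audio/wav;base64,' + 'A' * 100, max_length=40))
```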
evalscope-1.0.2/evalscope/api/mixin/sandbox_mixin.py
ADDED
@@ -0,0 +1,204 @@
+import asyncio
+import threading
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from ms_enclave.sandbox.manager import SandboxManager
+
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class SandboxMixin:
+    """Sandbox mixin for sandboxed code execution."""
+
+    def __init__(self, task_config: 'TaskConfig'):
+        self._task_config = task_config
+
+        self._manager: Optional['SandboxManager'] = None
+        """Sandbox manager instance."""
+
+        self._sandbox_id: Optional[str] = None
+        """Sandbox ID."""
+
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+        """Event loop for async operations."""
+
+        # Initialize sandbox synchronously by running async methods
+        if self.use_sandbox:
+            self._loop = asyncio.new_event_loop()
+
+            # Start the loop in a separate thread
+            def run_loop():
+                asyncio.set_event_loop(self._loop)
+                self._loop.run_forever()
+
+            self._loop_thread = threading.Thread(target=run_loop, daemon=True)
+            self._loop_thread.start()
+
+            # Wait for initialization
+            future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
+            future.result()
+
+        super().__init__()
+
+    async def _async_init(self):
+        """Async initialization helper."""
+        await self.init_sandbox_manager_async()
+        await self.init_sandbox_async()
+
+    @property
+    def use_sandbox(self) -> bool:
+        """
+        Return whether to use sandbox for the benchmark.
+        """
+        if not self._task_config:
+            return False
+        else:
+            return self._task_config.use_sandbox
+
+    @property
+    def sandbox_manager(self) -> Optional['SandboxManager']:
+        """Get the sandbox manager instance."""
+        return self._manager
+
+    @property
+    def sandbox_id(self) -> Optional[str]:
+        """Get the sandbox ID."""
+        return self._sandbox_id
+
+    async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager asynchronously."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+        manager_config = self._task_config.sandbox_manager_config or {}
+        if manager_config.get('base_url'):
+            # Remote manager
+            self._manager = HttpSandboxManager(**manager_config)
+        else:
+            # Local manager
+            self._manager = LocalSandboxManager(**manager_config)
+
+        await self._manager.start()
+        logger.info('Sandbox manager initialized.')
+        return self._manager
+
+    def init_sandbox_manager(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
+            return future.result()
+        else:
+            # Fallback for cases where no loop is available
+            return asyncio.run(self.init_sandbox_manager_async())
+
+    async def init_sandbox_async(self) -> Optional[str]:
+        """Initialize the sandbox instance asynchronously."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+        sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+            image='python:3.11-slim', tools_config={
+                'shell_executor': {},
+                'python_executor': {}
+            }
+        )
+        sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+        self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+        sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+        logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+        return self._sandbox_id
+
+    def init_sandbox(self) -> Optional[str]:
+        """Initialize the sandbox instance."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
+            return future.result()
+        else:
+            # Fallback for cases where no loop is available
+            return asyncio.run(self.init_sandbox_async())
+
+    def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+        """Execute code in the sandbox."""
+        if not self._sandbox_id or not self._manager:
+            logger.warning('Sandbox is not initialized.')
+            return {'error': 'Sandbox is not initialized.'}
+
+        from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+        async def _execute_async():
+            if language.lower() == 'python':
+                tool_name = 'python_executor'
+                parameters = {'code': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            elif language.lower() == 'shell':
+                tool_name = 'shell_executor'
+                parameters = {'command': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            else:
+                logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                result = ToolResult(
+                    status=ExecutionStatus.ERROR,
+                    tool_name='code_executor',
+                    output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                )
+            return result
+
+        # Use the dedicated loop if available
+        if self._loop and not self._loop.is_closed():
+            future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
+            result = future.result(timeout + 10)  # Add some buffer to the timeout
+        else:
+            # Fallback for cases where no loop is available
+            result = asyncio.run(_execute_async())
+
+        return result.model_dump(exclude_none=True)
+
+    def sandbox_finalize(self, *args, **kwargs):
+        """Finalize the sandbox manager."""
+        if self._manager:
+            try:
+                if self._loop and not self._loop.is_closed():
+                    # Stop the manager using the dedicated loop
+                    future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
+                    future.result(timeout=30)
+
+                    # Stop the event loop
+                    self._loop.call_soon_threadsafe(self._loop.stop)
+                    if hasattr(self, '_loop_thread'):
+                        self._loop_thread.join(timeout=5)
+
+                logger.info('Sandbox manager finalized.')
+            except Exception as e:
+                logger.warning(f'Error finalizing sandbox manager: {e}')
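Taken together with the `TaskConfig` fields it reads (`use_sandbox`, `sandbox_type`, `sandbox_config`, `sandbox_manager_config`), the mixin gives every `DataAdapter` synchronous access to sandboxed execution. A sketch of the intended wiring, assuming `TaskConfig` accepts these fields as constructor arguments (the field names come from this diff; construction details are an assumption):

```python
from evalscope.config import TaskConfig

config = TaskConfig(
    use_sandbox=True,            # turns the mixin on
    sandbox_type='docker',       # matches the CLI default added below
    sandbox_manager_config={},   # no base_url -> LocalSandboxManager
)

# Inside an adapter method (self is a DataAdapter, which now inherits
# SandboxMixin), generated code runs like this; the return value is a
# ToolResult dump, e.g. a dict with status/output fields:
#   result = self.execute_code_in_sandbox('print(1 + 1)', timeout=60, language='python')
```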
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/generate_config.py
RENAMED
@@ -36,9 +36,6 @@ class GenerateConfig(BaseModel):
     stream: Optional[bool] = Field(default=None)
     """Whether to stream the response (default is model specific)."""
 
-    system_message: Optional[str] = Field(default=None)
-    """Override the default system message."""
-
     max_tokens: Optional[int] = Field(default=None)
     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
 
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/api/model/model.py
RENAMED
@@ -365,7 +365,7 @@ def get_model(
 
     logger.info(
         f'Creating model {model} with eval_type={eval_type} '
-        f'base_url={base_url},
+        f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
     )
 
     # find a matching model type
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from docstring_parser import Docstring, parse
|
|
4
|
-
from pydantic import BaseModel, Field
|
|
4
|
+
from pydantic import BaseModel, Field, field_validator
|
|
5
5
|
from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
|
|
6
6
|
|
|
7
7
|
from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
|
|
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/arguments.py
RENAMED
@@ -87,6 +87,12 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
+
+    # Sandbox-related arguments
+    parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.')  # noqa: E501
+    parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.')  # noqa: E501
+    parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.')  # noqa: E501
+    parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.')  # noqa: E501
     # yapf: enable
 
 
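The JSON-valued flags parse with `type=json.loads`, so both a provided value and the string default are decoded to dicts. A self-contained demo of that parsing behaviour (the parser below mirrors the added lines; it is not the evalscope CLI itself):

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--use-sandbox', action='store_true', default=False)
parser.add_argument('--sandbox-type', type=str, default='docker')
parser.add_argument('--sandbox-config', type=json.loads, default='{}')
parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}')

args = parser.parse_args(['--use-sandbox', '--sandbox-config', '{"image": "python:3.11-slim"}'])
print(args.use_sandbox)             # True
print(args.sandbox_config)          # {'image': 'python:3.11-slim'}
print(args.sandbox_manager_config)  # {} (string default is also run through json.loads)
```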
evalscope-1.0.2/evalscope/benchmarks/ai2d/ai2d_adapter.py
ADDED
@@ -0,0 +1,53 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+logger = get_logger()
+
+SUBSET_LIST = ['default']
+
+MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ai2d',
+        pretty_name='AI2D',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description='A Diagram Is Worth A Dozen Images',
+        dataset_id='lmms-lab/ai2d',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class Ai2dAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answers_list: list[str] = record['options']
+        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        content_list: list[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+
+        label_answer = chr(int(record['answer']) + ord('A'))
+
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=answers_list, target=label_answer)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))
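AI2D records store the gold answer as a stringified option index; `record_to_sample` maps it to a letter so it lines up with the multiple-choice template. A quick check of that mapping:

```python
for raw in ['0', '1', '2', '3']:
    print(raw, '->', chr(int(raw) + ord('A')))  # 0 -> A, 1 -> B, 2 -> C, 3 -> D
```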
evalscope-1.0.2/evalscope/benchmarks/amc/amc_adapter.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='amc',
+        pretty_name='AMC',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'AMC (American Mathematics Competitions) is a series of mathematics competitions for high school students.',
+        dataset_id='evalscope/amc_22-24',
+        subset_list=['amc22', 'amc23', 'amc24'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
+)
+class AMCAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Use split as subset
+        self.split_as_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            metadata={
+                'year': record['year'],
+                'url': record['url'],
+                'solution': record.get('solution', '')
+            },
+        )
{evalscope-1.0.1 → evalscope-1.0.2}/evalscope/benchmarks/bbh/bbh_adapter.py
RENAMED
@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
         """
-        Extract
+        Extract normalized answer for BBH multiple-choice tasks.
+        Handles formats like:
+        - "answer is (A)"
+        - "The answer is A."
+        - Extra text after answer.
+        Always uses the *last* occurrence of "answer is".
         """
-
-
-
-
+        ans = ans.strip()
+
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
+
+        # Remove trailing period
+        if ans.endswith('.'):
+            ans = ans[:-1].strip()
+
+        # Capture uppercase letter inside parentheses (A) (B) ...
+        match = re.search(r'\(([A-Z])\)', ans)
         if match:
             return match.group(1)
-
+
+        # Capture single uppercase letter
+        match = re.search(r'\b([A-Z])\b', ans)
         if match:
             return match.group(1)
+
         return ans
 
     @classmethod
     def _extract_ff_answer(cls, ans: str):
         """
-        Extract the answer
+        Extract the normalized answer for BBH free-form tasks.
+        Handles patterns like:
+        - "answer is XXX."
+        - "The answer is **valid**."
+        - Extra trailing dots / line breaks.
+        - Bold-marked answers (**xxx**).
+        Always uses the *last* occurrence of "answer is".
         """
-
+        ans = ans.strip()
 
-
-        if
-
-
+        parts = ans.split('So the answer is ')
+        if len(parts) > 1:
+            ans = parts[-1].strip()
+            ans = ans.split('\n')[0].strip()
 
-
-        if len(ans_line) != 1:
-            ans = ans_line[1].strip()
-            ans = ans.split('\n')[0]
+        # Remove trailing period
         if ans.endswith('.'):
-            ans = ans[:-1]
+            ans = ans[:-1].strip()
+
+        # If answer is in bold (**xxx**), prefer the content inside
+        match = re.search(r'\*\*(.*?)\*\*', ans)
+        if match:
+            ans = match.group(1).strip()
+
         return ans
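For reference, the new multiple-choice extraction can be exercised standalone; this re-implementation copies the logic shown above so the regex behaviour is easy to sanity-check:

```python
import re


def extract_mc(ans: str) -> str:
    ans = ans.strip()
    parts = ans.split('So the answer is ')
    if len(parts) > 1:
        ans = parts[-1].strip()
        ans = ans.split('\n')[0].strip()
    if ans.endswith('.'):
        ans = ans[:-1].strip()
    match = re.search(r'\(([A-Z])\)', ans)   # "(B)" style
    if match:
        return match.group(1)
    match = re.search(r'\b([A-Z])\b', ans)   # bare "B" style
    if match:
        return match.group(1)
    return ans


assert extract_mc('Let me think. So the answer is (B).') == 'B'
assert extract_mc('The answer is A.') == 'A'
```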