evalscope 0.16.0__tar.gz → 0.16.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.16.0/evalscope.egg-info → evalscope-0.16.2}/PKG-INFO +16 -13
- {evalscope-0.16.0 → evalscope-0.16.2}/README.md +3 -0
- evalscope-0.16.2/evalscope/app/__init__.py +28 -0
- {evalscope-0.16.0/evalscope/report → evalscope-0.16.2/evalscope/app}/app.py +40 -30
- evalscope-0.16.2/evalscope/app/constants.py +21 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/arguments.py +2 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/backend_manager.py +2 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/embedding.py +77 -39
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/arc_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/benchmark.py +2 -0
- evalscope-0.16.2/evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/data_adapter.py +99 -16
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope-0.16.2/evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope-0.16.2/evalscope/benchmarks/docmath/utils.py +220 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope-0.16.2/evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope-0.16.2/evalscope/benchmarks/frames/utils.py +37 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope-0.16.2/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope-0.16.2/evalscope/benchmarks/needle_haystack/utils.py +79 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/race_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/tool_bench/utils.py +5 -4
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/utils.py +25 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_app.py +2 -2
- evalscope-0.16.2/evalscope/collections/__init__.py +35 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/evaluator.py +68 -34
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/config.py +8 -2
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/constants.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/evaluator.py +40 -28
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/__init__.py +3 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/llm_judge.py +12 -5
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/math_parser.py +1 -1
- evalscope-0.16.2/evalscope/metrics/t2v_metrics/__init__.py +52 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.16.0/tests/rag → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/__init__.py +2 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/base_adapter.py +31 -27
- evalscope-0.16.2/evalscope/models/adapters/bfcl_adapter.py +244 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/server_adapter.py +80 -23
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/custom/custom_model.py +0 -3
- evalscope-0.16.2/evalscope/models/custom/dummy_model.py +99 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/local_model.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/register.py +2 -1
- evalscope-0.16.2/evalscope/perf/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/arguments.py +4 -2
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/benchmark.py +16 -12
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/main.py +7 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/openai_api.py +2 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope-0.16.2/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/benchmark_util.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/local_server.py +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/log_utils.py +12 -5
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/rich_display.py +1 -1
- evalscope-0.16.2/evalscope/report/__init__.py +38 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/combinator.py +40 -6
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/generator.py +33 -9
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/utils.py +84 -4
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/run.py +12 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/summarizer.py +1 -1
- evalscope-0.16.2/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/io_utils.py +59 -2
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/logger.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/utils.py +12 -0
- evalscope-0.16.2/evalscope/version.py +4 -0
- {evalscope-0.16.0 → evalscope-0.16.2/evalscope.egg-info}/PKG-INFO +16 -13
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/SOURCES.txt +16 -2
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/requires.txt +12 -12
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/aigc.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/framework.txt +2 -3
- evalscope-0.16.2/requirements/opencompass.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/rag.txt +1 -1
- evalscope-0.16.2/requirements/vlmeval.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/aigc/test_t2i.py +48 -11
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_all.py +14 -3
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_collection.py +6 -4
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_run.py +50 -25
- evalscope-0.16.2/tests/rag/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_clip_benchmark.py +5 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_mteb.py +51 -7
- evalscope-0.16.0/evalscope/collections/__init__.py +0 -3
- evalscope-0.16.0/evalscope/metrics/t2v_metrics/__init__.py +0 -66
- evalscope-0.16.0/evalscope/models/custom/dummy_model.py +0 -61
- evalscope-0.16.0/evalscope/report/__init__.py +0 -6
- evalscope-0.16.0/evalscope/version.py +0 -4
- evalscope-0.16.0/requirements/opencompass.txt +0 -1
- evalscope-0.16.0/requirements/vlmeval.txt +0 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/LICENSE +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/MANIFEST.in +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/__init__.py +0 -0
- evalscope-0.16.0/evalscope/report/app_arguments.py → evalscope-0.16.2/evalscope/app/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/chinese_simple_qa → evalscope-0.16.2/evalscope/benchmarks/bfcl}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/data_collection → evalscope-0.16.2/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/drop → evalscope-0.16.2/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/general_mcq → evalscope-0.16.2/evalscope/benchmarks/docmath}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/gpqa → evalscope-0.16.2/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/ifeval → evalscope-0.16.2/evalscope/benchmarks/frames}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/iquiz → evalscope-0.16.2/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/live_code_bench → evalscope-0.16.2/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/maritime_bench → evalscope-0.16.2/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/math_500 → evalscope-0.16.2/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/mmlu_pro → evalscope-0.16.2/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/mmlu_redux → evalscope-0.16.2/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/musr → evalscope-0.16.2/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/process_bench → evalscope-0.16.2/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/simple_qa → evalscope-0.16.2/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/super_gpqa → evalscope-0.16.2/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/tool_bench → evalscope-0.16.2/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/winogrande → evalscope-0.16.2/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models → evalscope-0.16.2/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.2/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.2/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.2/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/cli.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/schema.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/perf → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.16.0/evalscope/perf/utils → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/third_party/thinkbench/tools → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/chat_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/run_arena.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/filters.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/app.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/docs.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements/perf.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/requirements.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/setup.cfg +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/setup.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/aigc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/perf/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/perf/test_perf.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/test_run_all.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/vlm/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.2}/tests/vlm/test_vlmeval.py +0 -0
**{evalscope-0.16.0/evalscope.egg-info → evalscope-0.16.2}/PKG-INFO**

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.16.0
+Version: 0.16.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
@@ -40,20 +40,19 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
-Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 Provides-Extra: rag
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "rag"
 Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
@@ -73,14 +72,15 @@ Requires-Dist: iopath; extra == "aigc"
 Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
+Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
@@ -98,17 +98,16 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
-Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "all"
 Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
@@ -125,6 +124,7 @@ Requires-Dist: iopath; extra == "all"
 Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open_clip_torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 
 <p align="center">
     <br>
@@ -230,6 +230,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
```
@@ -102,6 +102,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
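A hedged sketch of running one of the newly supported benchmarks through the `TaskConfig`/`run_task` Python API referenced in the EvalScope documentation; `needle_haystack` is named in the note above, while the model id and the `frames` dataset id are illustrative assumptions:

```python
# Sketch only: dataset registry ids other than `needle_haystack` and the model
# name are assumptions for illustration.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',        # hypothetical model to evaluate
    datasets=['needle_haystack', 'frames'],  # assumed benchmark ids for this release
    limit=10,                                # evaluate only a few samples while smoke-testing
)
run_task(task_cfg=task_cfg)
```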
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .app import create_app
+    from .arguments import add_argument
+
+else:
+    _import_structure = {
+        'app': [
+            'create_app',
+        ],
+        'arguments': [
+            'add_argument',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
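The new `evalscope.app` package initializer wires the visualization app through `_LazyModule`, so the heavy Gradio imports are deferred until a name is first accessed. A hedged usage sketch (the `create_app` signature is not shown in this hunk, so the final call is an assumption):

```python
import argparse

# Both names resolve lazily through _LazyModule on first access.
from evalscope.app import add_argument, create_app

parser = argparse.ArgumentParser('evalscope dashboard')
add_argument(parser)          # registers the app's CLI options (see arguments.py)
args = parser.parse_args([])  # fall back to defaults for this sketch
create_app(args)              # assumed to build and launch the Gradio app with these args
```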
@@ -1,6 +1,7 @@
 import argparse
 import glob
 import gradio as gr
+import json
 import numpy as np
 import os
 import pandas as pd
@@ -11,35 +12,15 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey,
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
+from .arguments import add_argument
+from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
 
 logger = get_logger()
 
-PLOTLY_THEME = 'plotly_dark'
-REPORT_TOKEN = '@@'
-MODEL_TOKEN = '::'
-DATASET_TOKEN = ', '
-LATEX_DELIMITERS = [{
-    'left': '$$',
-    'right': '$$',
-    'display': True
-}, {
-    'left': '$',
-    'right': '$',
-    'display': False
-}, {
-    'left': '\\(',
-    'right': '\\)',
-    'display': False
-}, {
-    'left': '\\[',
-    'right': '\\]',
-    'display': True
-}]
-
 
 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -155,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):
 
 def plot_single_report_sunburst(report_list: List[Report]):
     if report_list[0].name == DataCollection.NAME:
-        df = get_data_frame(report_list)
+        df = get_data_frame(report_list=report_list)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = categories + [ReportKey.subset_name]
     else:
-        df = get_data_frame(report_list, flatten_metrics=False)
+        df = get_data_frame(report_list=report_list, flatten_metrics=False)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
     logger.debug(f'df: {df}')
@@ -185,6 +166,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
     return df, styler
 
 
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
 def plot_single_dataset_scores(df: pd.DataFrame):
     # TODO: add metric radio and relace category name
     plot = px.bar(
@@ -246,7 +234,7 @@ def convert_html_tags(text):
 def process_string(string: str, max_length: int = 2048) -> str:
     string = convert_html_tags(string) # for display labels e.g.
     if max_length and len(string) > max_length:
-        return f'{string[:max_length // 2]}
+        return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
     return string
 
 
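For illustration, a stand-alone sketch of the truncation rule introduced above (re-implemented here because `convert_html_tags` is defined elsewhere in the module): keep the first and last `max_length // 2` characters and join them with an explicit marker.

```python
# Minimal re-implementation of the truncation rule from process_string.
def truncate_middle(string: str, max_length: int = 2048) -> str:
    if max_length and len(string) > max_length:
        return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
    return string

print(truncate_middle('x' * 10, max_length=8))  # xxxx...[truncate]...xxxx
```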
@@ -270,7 +258,7 @@ def dict_to_markdown(data) -> str:
     return '\n\n'.join(markdown_lines)
 
 
-def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     """
     Process model prediction output into a formatted string.
 
@@ -294,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
     return result
 
 
+def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+    if isinstance(item, (dict, list)):
+        result = json.dumps(item, ensure_ascii=False, indent=2)
+        result = f'```json\n{result}\n```'
+    else:
+        result = str(item)
+
+    # Apply HTML tag conversion and truncation only at the final output
+    if max_length is not None:
+        return process_string(result, max_length)
+
+    return result
+
+
 def normalize_score(score):
     try:
         if isinstance(score, bool):
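A quick illustration of the new behaviour: dict and list predictions are rendered as a fenced JSON block, everything else is stringified. This sketch reproduces only the JSON branch, since `process_string` lives elsewhere in the module:

```python
import json
from typing import Any

FENCE = '`' * 3  # build the fence programmatically to avoid nesting literal fences here

# Minimal stand-in for the dict/list branch of process_model_prediction.
def render_prediction(item: Any) -> str:
    if isinstance(item, (dict, list)):
        body = json.dumps(item, ensure_ascii=False, indent=2)
        return f'{FENCE}json\n{body}\n{FENCE}'
    return str(item)

print(render_prediction({'answer': 'B', 'score': 1.0}))
```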
@@ -456,6 +458,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
             'zh': '数据集分数',
             'en': 'Dataset Scores'
         },
+        'report_analysis': {
+            'zh': '报告智能分析',
+            'en': 'Report Intelligent Analysis'
+        },
         'dataset_scores_table': {
             'zh': '数据集分数表',
             'en': 'Dataset Scores Table'
@@ -511,6 +517,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         with gr.Tab(locale_dict['dataset_details'][lang]):
             dataset_radio = gr.Radio(
                 label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+            # show dataset details
+            with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+                report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
             gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
             dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
             gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -586,15 +595,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         @gr.on(
             triggers=[dataset_radio.change, report_list.change],
             inputs=[dataset_radio, report_list],
-            outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+            outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
         def update_single_report_dataset(dataset_name, report_list):
             logger.debug(f'Updating single report dataset: {dataset_name}')
-            report_df = get_data_frame(report_list)
+            report_df = get_data_frame(report_list=report_list)
+            analysis = get_report_analysis(report_list, dataset_name)
             data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
             data_score_plot = plot_single_dataset_scores(data_score_df)
             subsets = data_score_df[ReportKey.subset_name].unique().tolist()
             logger.debug(f'subsets: {subsets}')
-            return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+            return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
         @gr.on(
             triggers=[subset_select.change],
@@ -0,0 +1,21 @@
+PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
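A hedged sketch of how a delimiter list like `LATEX_DELIMITERS` is typically consumed, assuming the installed Gradio version supports the `latex_delimiters` argument on `gr.Markdown` (only a subset of the entries above is repeated here):

```python
import gradio as gr

# Subset of the delimiter configuration defined above; passing it to a
# Markdown component is an assumption about the Gradio API, not shown in this diff.
delimiters = [
    {'left': '$$', 'right': '$$', 'display': True},
    {'left': '$', 'right': '$', 'display': False},
]

with gr.Blocks() as demo:
    gr.Markdown(r'Euler: $e^{i\pi} + 1 = 0$', latex_delimiters=delimiters)

if __name__ == '__main__':
    demo.launch()
```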
@@ -67,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
@@ -89,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
     # yapf: enable
 
 
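A stand-alone sketch of the two options touched by these hunks, mirrored in a minimal parser to show how they are passed on the command line (the full `evalscope` parser defines many more options than shown here):

```python
import argparse

# Excerpt-only parser: reproduces just --limit and --analysis-report from the hunks above.
parser = argparse.ArgumentParser('evalscope (excerpt)')
parser.add_argument('--limit', type=float, default=None,
                    help='Max evaluation samples num for each subset.')
parser.add_argument('--analysis-report', action='store_true', default=False,
                    help='Generate analysis report for the evaluation results using judge model.')

args = parser.parse_args(['--limit', '100', '--analysis-report'])
print(args.limit, args.analysis_report)  # 100.0 True
```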
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import subprocess
 import tempfile
 from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
             model_d['meta_template'] = get_template(model_d['meta_template'])
 
             # set the 'abbr' as the 'path' if 'abbr' is not specified
-            model_d['abbr'] = model_d['path']
+            model_d['abbr'] = os.path.basename(model_d['path'])
 
             model_config = ApiModelConfig(**model_d)
             models.append(asdict(model_config))
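The effect of the change above, illustrated with a hypothetical model path (the concrete path is only an example):

```python
import os

# Before: the OpenCompass 'abbr' was the full path; after: only the last path component.
model_path = 'Qwen/Qwen2.5-7B-Instruct'  # hypothetical model id/path
print(os.path.basename(model_path))      # -> 'Qwen2.5-7B-Instruct'
```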
{evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
RENAMED
@@ -1,4 +1,5 @@
 import os
+import posixpath # For URL path handling
 import torch
 from torch.utils.data import DataLoader
 from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca
 
     Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
     """
+    import requests
     import webdataset as wds
 
     def read_txt(fname):
-        if '://'
-
-
-
-
+        if fname.startswith(('http://', 'https://')):
+            try:
+                response = requests.get(fname)
+                response.raise_for_status() # Ensure the HTTP request was successful
+                return response.text
+            except requests.exceptions.RequestException as e:
+                raise FileNotFoundError(f'Failed to read {fname}: {e}')
         else:
             with open(fname, 'r') as file:
-
-
+                return file.read()
+
+    def url_path_join(*parts):
+        """Join URL path parts with forward slashes regardless of platform"""
+        return posixpath.join(*parts)
 
     if not data_dir:
         data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'
 
     # Git LFS files have a different file path to access the raw data than other files
-
+    is_url = data_dir.startswith(('http://', 'https://'))
+    if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
         *split_url_head, _, url_path = data_dir.split('/', 7)
         url_head = '/'.join(split_url_head)
         metadata_dir = '/'.join([url_head, 'resolve', url_path])
         tardata_dir = '/'.join([url_head, 'resolve', url_path])
     else:
         metadata_dir = tardata_dir = data_dir
+
+    # Use appropriate path joining function based on whether we're dealing with a URL
+    path_join = url_path_join if is_url else os.path.join
+
     # Get number of shards
-    nshards_fname =
+    nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
     nshards = int(read_txt(nshards_fname)) # Do not catch FileNotFound, nshards.txt should be mandatory
 
     # Get dataset type (classification or retrieval)
-    type_fname =
+    type_fname = path_join(metadata_dir, 'dataset_type.txt')
     try:
         dataset_type = read_txt(type_fname).strip().lower()
     except FileNotFoundError:
         dataset_type = 'classification'
 
-    filepattern =
+    filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
     # Load webdataset (support WEBP, PNG, and JPG for now)
     if not cache_dir or not isinstance(cache_dir, str):
         cache_dir = None
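The `url_path_join` helper exists because `os.path.join` would insert backslash separators on Windows, which breaks URLs. A quick stand-alone illustration (the dataset name in the URL is hypothetical):

```python
import os
import posixpath

shard_dir = 'https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master'  # hypothetical dataset

# posixpath.join always uses '/', which is what a URL needs.
print(posixpath.join(shard_dir, 'test', 'nshards.txt'))
# https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master/test/nshards.txt

# os.path.join uses the platform separator, so it is only safe for local paths.
print(os.path.join('local_data', 'test', 'nshards.txt'))
```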
@@ -11,7 +11,9 @@ class ModelArguments:
     pooling_mode: Optional[str] = None
     max_seq_length: int = 512 # max sequence length
     # prompt for llm based model
-    prompt: str =
+    prompt: Optional[str] = None
+    # prompts dictionary for different tasks, if prompt is not set
+    prompts: Optional[Dict[str, str]] = None
     # model kwargs
     model_kwargs: dict = field(default_factory=dict)
     # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
             'pooling_mode': self.pooling_mode,
             'max_seq_length': self.max_seq_length,
             'prompt': self.prompt,
+            'prompts': self.prompts,
             'model_kwargs': self.model_kwargs,
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
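A hedged sketch of the fallback logic implied by the comments above (a single `prompt` wins; otherwise the per-task `prompts` mapping is consulted). The task names and instruction strings are illustrative only:

```python
from typing import Dict, Optional

# Illustration of the prompt/prompts precedence suggested by the new fields.
def resolve_prompt(task_name: str,
                   prompt: Optional[str],
                   prompts: Optional[Dict[str, str]]) -> Optional[str]:
    if prompt is not None:
        return prompt          # a global prompt applies to every task
    if prompts:
        return prompts.get(task_name)  # otherwise look up the task-specific prompt
    return None

print(resolve_prompt('Retrieval', None, {'Retrieval': 'Represent this query: '}))
```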
@@ -1,6 +1,6 @@
 import mteb
 import os
-from
+from tabulate import tabulate
 
 from evalscope.backend.rag_eval import EmbeddingModel, cmteb
 from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
     model_name = model.mteb_model_meta.model_name_as_path()
     revision = model.mteb_model_meta.revision
 
-
+    data = []
+    for model_res in results:
+        main_res = model_res.only_main_score()
+        for split, score in main_res.scores.items():
+            for sub_score in score:
+                data.append({
+                    'Model': model_name.replace('eval__', ''),
+                    'Revision': revision,
+                    'Task Type': main_res.task_type,
+                    'Task': main_res.task_name,
+                    'Split': split,
+                    'Subset': sub_score['hf_subset'],
+                    'Main Score': sub_score['main_score'],
+                })
 
     save_path = os.path.join(
         output_folder,
         model_name,
         revision,
     )
-    logger.info(f'Evaluation results:\n{
+    logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
     logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')
 
 
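`tabulate` accepts a list of dicts directly when `headers='keys'`; a small stand-alone illustration of the grid output used above (row values are made up):

```python
from tabulate import tabulate

# Rows in the same shape show_results builds; scores and names are illustrative.
data = [
    {'Model': 'my-embedding-model', 'Task': 'TNews', 'Split': 'test', 'Main Score': 0.512},
    {'Model': 'my-embedding-model', 'Task': 'T2Retrieval', 'Split': 'dev', 'Main Score': 0.731},
]
print(tabulate(data, headers='keys', tablefmt='grid'))
```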
@@ -34,6 +47,7 @@ def one_stage_eval(
     tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
     evaluation = mteb.MTEB(tasks=tasks)
 
+    eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
     # run evaluation
     results = evaluation.run(model, **eval_args)
 
@@ -66,6 +80,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model1_args.get('encode_kwargs', {}),
     )
     # stage 2: run cross encoder
     results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model2_args.get('encode_kwargs', {}),
     )
 
     # save and log results