evalscope 0.16.0.tar.gz → 0.16.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- {evalscope-0.16.0/evalscope.egg-info → evalscope-0.16.1}/PKG-INFO +13 -11
- {evalscope-0.16.0 → evalscope-0.16.1}/README.md +2 -0
- evalscope-0.16.1/evalscope/app/__init__.py +28 -0
- {evalscope-0.16.0/evalscope/report → evalscope-0.16.1/evalscope/app}/app.py +20 -25
- evalscope-0.16.1/evalscope/app/constants.py +21 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/arguments.py +2 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/backend_manager.py +2 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/embedding.py +75 -35
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/benchmark.py +1 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/data_adapter.py +97 -16
- evalscope-0.16.1/evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope-0.16.1/evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope-0.16.1/evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope-0.16.1/evalscope/benchmarks/frames/utils.py +37 -0
- evalscope-0.16.1/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope-0.16.1/evalscope/benchmarks/needle_haystack/utils.py +79 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/tool_bench/utils.py +5 -4
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/utils.py +25 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/start_app.py +2 -2
- evalscope-0.16.1/evalscope/collections/__init__.py +35 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/collections/evaluator.py +18 -6
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/config.py +8 -2
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/evaluator/evaluator.py +38 -27
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/__init__.py +3 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/llm_judge.py +12 -5
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/math_parser.py +1 -1
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/third_party/thinkbench/tools → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/server_adapter.py +2 -6
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/arguments.py +2 -2
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/benchmark.py +0 -9
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/main.py +7 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope-0.16.1/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/benchmark_util.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/local_server.py +1 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/log_utils.py +12 -5
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/rich_display.py +1 -1
- evalscope-0.16.1/evalscope/report/__init__.py +38 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/report/combinator.py +8 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/report/generator.py +33 -9
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/report/utils.py +60 -3
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/run.py +12 -0
- evalscope-0.16.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/logger.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/utils.py +12 -0
- evalscope-0.16.1/evalscope/version.py +4 -0
- {evalscope-0.16.0 → evalscope-0.16.1/evalscope.egg-info}/PKG-INFO +13 -11
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/SOURCES.txt +13 -2
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/requires.txt +10 -10
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/framework.txt +2 -2
- evalscope-0.16.1/requirements/opencompass.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/rag.txt +1 -1
- evalscope-0.16.1/requirements/vlmeval.txt +1 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/aigc/test_t2i.py +40 -3
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/cli/test_all.py +39 -35
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/cli/test_collection.py +7 -6
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/cli/test_run.py +21 -11
- evalscope-0.16.1/tests/rag/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/rag/test_mteb.py +5 -5
- evalscope-0.16.0/evalscope/collections/__init__.py +0 -3
- evalscope-0.16.0/evalscope/report/__init__.py +0 -6
- evalscope-0.16.0/evalscope/version.py +0 -4
- evalscope-0.16.0/requirements/opencompass.txt +0 -1
- evalscope-0.16.0/requirements/vlmeval.txt +0 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/LICENSE +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/MANIFEST.in +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/__init__.py +0 -0
- evalscope-0.16.0/evalscope/report/app_arguments.py → evalscope-0.16.1/evalscope/app/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/drop → evalscope-0.16.1/evalscope/benchmarks/docmath}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/general_mcq → evalscope-0.16.1/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/gpqa → evalscope-0.16.1/evalscope/benchmarks/frames}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/ifeval → evalscope-0.16.1/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/iquiz → evalscope-0.16.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/live_code_bench → evalscope-0.16.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/maritime_bench → evalscope-0.16.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/math_500 → evalscope-0.16.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/mmlu_pro → evalscope-0.16.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/mmlu_redux → evalscope-0.16.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/musr → evalscope-0.16.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/process_bench → evalscope-0.16.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/simple_qa → evalscope-0.16.1/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/super_gpqa → evalscope-0.16.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/tool_bench → evalscope-0.16.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.16.0/evalscope/benchmarks/winogrande → evalscope-0.16.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models → evalscope-0.16.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/constants.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.16.0/evalscope/perf → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.16.0/evalscope/perf/utils → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/base_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/chat_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/model.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/models/register.py +0 -0
- {evalscope-0.16.0/tests/rag → evalscope-0.16.1/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/filters.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/aigc.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/app.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/docs.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements/perf.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/requirements.txt +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/setup.cfg +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/setup.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/aigc/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/perf/test_perf.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/test_run_all.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.16.0 → evalscope-0.16.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.16.0/evalscope.egg-info → evalscope-0.16.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.16.0
+Version: 0.16.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
 Requires-Dist: langdetect
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
@@ -45,15 +45,15 @@ Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 Provides-Extra: rag
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "rag"
 Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
@@ -75,12 +75,12 @@ Requires-Dist: open_clip_torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: datasets
+Requires-Dist: datasets>=3.0; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
 Requires-Dist: langdetect; extra == "all"
-Requires-Dist:
+Requires-Dist: latex2sympy2_extended; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
@@ -102,13 +102,13 @@ Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
-Requires-Dist: ms-vlmeval>=0.0.
+Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
+Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
 Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: mteb==1.
+Requires-Dist: mteb==1.38.20; extra == "all"
 Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
@@ -230,6 +230,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
{evalscope-0.16.0 → evalscope-0.16.1}/README.md

@@ -102,6 +102,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
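The news entries above describe the new benchmarks only in prose. A rough, hypothetical sketch of how they might be selected through EvalScope's usual Python entry point is shown below; the model identifier is a placeholder, and the `docmath`/`frames` dataset names are inferred from the new `evalscope/benchmarks/docmath` and `evalscope/benchmarks/frames` directories rather than confirmed by this diff.

```python
# Hypothetical usage sketch for the benchmarks added in 0.16.1 (not part of this diff).
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',    # placeholder model id
    datasets=['needle_haystack'],        # per the news entry; heatmaps land in outputs/reports
    # datasets=['docmath', 'frames'],    # assumed names for the long-document benchmarks
)
run_task(task_cfg)
```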
@@ -0,0 +1,28 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .app import create_app
+    from .arguments import add_argument
+
+else:
+    _import_structure = {
+        'app': [
+            'create_app',
+        ],
+        'arguments': [
+            'add_argument',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
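Note: the `_import_structure` in the new package `__init__.py` above keeps importing the package cheap while still exposing its public names. A minimal sketch of the intended import path:

```python
# Resolved lazily on first attribute access via _LazyModule.
from evalscope.app import add_argument, create_app
```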
@@ -11,35 +11,15 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey,
+from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
+from .arguments import add_argument
+from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
 
 logger = get_logger()
 
-PLOTLY_THEME = 'plotly_dark'
-REPORT_TOKEN = '@@'
-MODEL_TOKEN = '::'
-DATASET_TOKEN = ', '
-LATEX_DELIMITERS = [{
-    'left': '$$',
-    'right': '$$',
-    'display': True
-}, {
-    'left': '$',
-    'right': '$',
-    'display': False
-}, {
-    'left': '\\(',
-    'right': '\\)',
-    'display': False
-}, {
-    'left': '\\[',
-    'right': '\\]',
-    'display': True
-}]
-
 
 def scan_for_report_folders(root_path):
     """Scan for folders containing reports subdirectories"""
@@ -185,6 +165,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
     return df, styler
 
 
+def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+    for report in report_list:
+        if report.dataset_name == dataset_name:
+            return report.analysis
+    return 'N/A'
+
+
 def plot_single_dataset_scores(df: pd.DataFrame):
     # TODO: add metric radio and relace category name
     plot = px.bar(
@@ -456,6 +443,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
            'zh': '数据集分数',
            'en': 'Dataset Scores'
        },
+        'report_analysis': {
+            'zh': '报告智能分析',
+            'en': 'Report Intelligent Analysis'
+        },
        'dataset_scores_table': {
            'zh': '数据集分数表',
            'en': 'Dataset Scores Table'
@@ -511,6 +502,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         with gr.Tab(locale_dict['dataset_details'][lang]):
             dataset_radio = gr.Radio(
                 label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+            # show dataset details
+            with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+                report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
             gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
             dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
             gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -586,15 +580,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         @gr.on(
             triggers=[dataset_radio.change, report_list.change],
             inputs=[dataset_radio, report_list],
-            outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+            outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
         def update_single_report_dataset(dataset_name, report_list):
             logger.debug(f'Updating single report dataset: {dataset_name}')
             report_df = get_data_frame(report_list)
+            analysis = get_report_analysis(report_list, dataset_name)
             data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
             data_score_plot = plot_single_dataset_scores(data_score_df)
             subsets = data_score_df[ReportKey.subset_name].unique().tolist()
             logger.debug(f'subsets: {subsets}')
-            return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+            return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
         @gr.on(
             triggers=[subset_select.change],
@@ -0,0 +1,21 @@
+PLOTLY_THEME = 'plotly_dark'
+REPORT_TOKEN = '@@'
+MODEL_TOKEN = '::'
+DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
@@ -67,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
@@ -89,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
     # yapf: enable
 
 
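Note: the float-typed `--limit` and the new `--analysis-report` flag have Python-API counterparts; a hedged sketch, assuming `TaskConfig` exposes matching fields (the model id and dataset are placeholders):

```python
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # placeholder dataset
    limit=0.1,             # with type=float, a fractional per-subset limit becomes expressible
    analysis_report=True,  # assumed to mirror the --analysis-report CLI flag
)
run_task(task)
```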
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 import subprocess
 import tempfile
 from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
                 model_d['meta_template'] = get_template(model_d['meta_template'])
 
             # set the 'abbr' as the 'path' if 'abbr' is not specified
-            model_d['abbr'] = model_d['path']
+            model_d['abbr'] = os.path.basename(model_d['path'])
 
             model_config = ApiModelConfig(**model_d)
             models.append(asdict(model_config))
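Note: the effect of switching `abbr` to `os.path.basename` above, in isolation (the path is illustrative):

```python
import os

# Before the change, abbr was the full model path; now it is the final path component.
os.path.basename('/models/Qwen/Qwen2.5-7B-Instruct')  # -> 'Qwen2.5-7B-Instruct'
```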
@@ -11,7 +11,9 @@ class ModelArguments:
     pooling_mode: Optional[str] = None
     max_seq_length: int = 512 # max sequence length
     # prompt for llm based model
-    prompt: str =
+    prompt: Optional[str] = None
+    # prompts dictionary for different tasks, if prompt is not set
+    prompts: Optional[Dict[str, str]] = None
     # model kwargs
     model_kwargs: dict = field(default_factory=dict)
     # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
             'pooling_mode': self.pooling_mode,
             'max_seq_length': self.max_seq_length,
             'prompt': self.prompt,
+            'prompts': self.prompts,
             'model_kwargs': self.model_kwargs,
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
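Note: a hedged sketch of how the new `prompts` mapping could be supplied (the import path follows the file being diffed; the model id, field name `model_name_or_path`, task name, and instruction text are placeholders):

```python
from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments

model_args = ModelArguments(
    model_name_or_path='BAAI/bge-large-en-v1.5',  # placeholder embedding model
    prompts={
        # per-task query instruction, consulted only when `prompt` is unset
        'T2Retrieval': 'Represent this query for retrieving relevant passages: ',
    },
)
```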
@@ -1,6 +1,6 @@
 import mteb
 import os
-from
+from tabulate import tabulate
 
 from evalscope.backend.rag_eval import EmbeddingModel, cmteb
 from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
     model_name = model.mteb_model_meta.model_name_as_path()
     revision = model.mteb_model_meta.revision
 
-
+    data = []
+    for model_res in results:
+        main_res = model_res.only_main_score()
+        for split, score in main_res.scores.items():
+            for sub_score in score:
+                data.append({
+                    'Model': model_name.replace('eval__', ''),
+                    'Revision': revision,
+                    'Task Type': main_res.task_type,
+                    'Task': main_res.task_name,
+                    'Split': split,
+                    'Subset': sub_score['hf_subset'],
+                    'Main Score': sub_score['main_score'],
+                })
 
     save_path = os.path.join(
         output_folder,
         model_name,
         revision,
     )
-    logger.info(f'Evaluation results:\n{
+    logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
     logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')
 
 
@@ -34,6 +47,7 @@ def one_stage_eval(
     tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
     evaluation = mteb.MTEB(tasks=tasks)
 
+    eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
     # run evaluation
     results = evaluation.run(model, **eval_args)
 
@@ -66,6 +80,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model1_args.get('encode_kwargs', {}),
     )
     # stage 2: run cross encoder
     results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
         overwrite_results=True,
         hub=eval_args['hub'],
         limits=eval_args['limits'],
+        encode_kwargs=model2_args.get('encode_kwargs', {}),
     )
 
     # save and log results
@@ -2,6 +2,7 @@ import os
 import torch
 from langchain_core.embeddings import Embeddings
 from langchain_openai.embeddings import OpenAIEmbeddings
+from mteb.encoder_interface import PromptType
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
@@ -12,6 +13,7 @@ from typing import Dict, List, Optional, Union
 from evalscope.backend.rag_eval.utils.tools import download_model
 from evalscope.constants import HubType
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 
 logger = get_logger()
 
@@ -22,14 +24,14 @@ class BaseModel(Embeddings):
         self,
         model_name_or_path: str = '',
         max_seq_length: int = 512,
-        prompt: str =
+        prompt: Optional[str] = None,
+        prompts: Optional[Dict[str, str]] = None,
         revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
         self.max_seq_length = max_seq_length
         self.model_kwargs = kwargs.pop('model_kwargs', {})
-        self.model_kwargs['trust_remote_code'] = True
 
         self.config_kwargs = kwargs.pop('config_kwargs', {})
         self.config_kwargs['trust_remote_code'] = True
@@ -38,7 +40,9 @@ class BaseModel(Embeddings):
         self.encode_kwargs['convert_to_tensor'] = True
 
         self.prompt = prompt
+        self.prompts = prompts if prompts else {}
         self.revision = revision
+        self.framework = ['PyTorch']
 
     @property
     def mteb_model_meta(self):
@@ -46,10 +50,22 @@ class BaseModel(Embeddings):
         from mteb import ModelMeta
 
         return ModelMeta(
-            name=os.path.basename(self.model_name_or_path),
+            name='eval/' + os.path.basename(self.model_name_or_path), # Ensure the name contains a slash
             revision=self.revision,
             languages=None,
             release_date=None,
+            n_parameters=None,
+            memory_usage_mb=None,
+            max_tokens=None,
+            embed_dim=None,
+            license=None,
+            open_weights=None,
+            public_training_code=None,
+            public_training_data=None,
+            similarity_fn_name=None,
+            use_instructions=None,
+            training_datasets=None,
+            framework=self.framework,
         )
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -61,7 +77,7 @@ class BaseModel(Embeddings):
         Returns:
             List of embeddings.
         """
-        return self.
+        return self.encode(texts).tolist()
 
     def embed_query(self, text: str) -> List[float]:
         """Embed query text. Compact langchain.
@@ -72,19 +88,17 @@ class BaseModel(Embeddings):
         Returns:
             Embedding.
         """
-        return self.
+        return self.encode(text).tolist()
 
     def encode(self, texts: Union[str, List[str]], **kwargs) -> List[List[float]]:
         """Embed text."""
         raise NotImplementedError
 
-    def
-    """
-
-
-
-        """Embed search docs . Compact mteb."""
-        raise NotImplementedError
+    def get_prompt(self, task_name: str) -> Optional[str]:
+        """Get prompt for the given task name."""
+        if self.prompt:
+            return self.prompt
+        return self.prompts.get(task_name, None)
 
 
 class SentenceTransformerModel(BaseModel):
@@ -92,6 +106,9 @@ class SentenceTransformerModel(BaseModel):
     def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
 
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
+        self.model_kwargs['trust_remote_code'] = True
         if not pooling_mode:
             self.model = SentenceTransformer(
                 self.model_name_or_path,
@@ -112,36 +129,47 @@ class SentenceTransformerModel(BaseModel):
 
         self.model.max_seq_length = self.max_seq_length
 
-
-
+        self.supported_encode_params = get_supported_params(self.model.encode)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> List[torch.Tensor]:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         embeddings = self.model.encode(texts, prompt=prompt, **self.encode_kwargs)
         assert isinstance(embeddings, Tensor)
         return embeddings.cpu().detach()
 
-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, prompt=self.prompt)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts)
-
 
 class CrossEncoderModel(BaseModel):
 
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
+
+        self.framework = ['Sentence Transformers', 'PyTorch']
+
         self.model = CrossEncoder(
             self.model_name_or_path,
             trust_remote_code=True,
             max_length=self.max_seq_length,
+            automodel_args=self.model_kwargs,
         )
+        self.supported_encode_params = get_supported_params(self.model.predict)
 
     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                kwargs.pop(key)
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3: # Note: For mteb retrieval task
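Note: with the encode changes above, the instruction prompt is applied only on the query side. A hedged usage sketch (the import path follows the file being diffed; the model id, task name, and instruction are placeholders):

```python
from mteb.encoder_interface import PromptType

from evalscope.backend.rag_eval.utils.embedding import SentenceTransformerModel

# Placeholder checkpoint; any sentence-transformers model would do.
emb = SentenceTransformerModel('BAAI/bge-small-en-v1.5', prompts={'T2Retrieval': 'query: '})

# Query-side call: the per-task prompt is looked up and forwarded to sentence-transformers.
q = emb.encode(['what is the capital of France?'], task_name='T2Retrieval', prompt_type=PromptType.query)

# Passage-side call: no prompt is injected.
d = emb.encode(['Paris is the capital of France.'], task_name='T2Retrieval', prompt_type=PromptType.passage)
```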
@@ -163,6 +191,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.framework = ['API']
 
         self.model = OpenAIEmbeddings(
             model=self.model_name,
@@ -175,26 +204,37 @@ class APIEmbeddingModel(BaseModel):
 
         self.batch_size = self.encode_kwargs.get('batch_size', 10)
 
+        self.supported_encode_params = get_supported_params(self.model.embed_documents)
+
     def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        # pop unused kwargs
+        extra_params = {}
+        for key in list(kwargs.keys()):
+            if key not in self.supported_encode_params:
+                extra_params[key] = kwargs.pop(key)
+        self.encode_kwargs.update(kwargs)
+
+        # set prompt if provided
+        prompt = None
+        prompt_type = extra_params.pop('prompt_type', '')
+        task_name = extra_params.pop('task_name', '')
+        if prompt_type and prompt_type == PromptType.query:
+            prompt = self.get_prompt(task_name)
+
         if isinstance(texts, str):
             texts = [texts]
 
         embeddings: List[List[float]] = []
         for i in tqdm(range(0, len(texts), self.batch_size)):
-
+            # set prompt if provided
+            if prompt is not None:
+                batch_texts = [prompt + text for text in texts[i:i + self.batch_size]]
+            else:
+                batch_texts = texts[i:i + self.batch_size]
+            response = self.model.embed_documents(batch_texts, chunk_size=self.batch_size)
             embeddings.extend(response)
         return torch.tensor(embeddings)
 
-    def encode_queries(self, queries, **kwargs):
-        return self.encode(queries, **kwargs)
-
-    def encode_corpus(self, corpus, **kwargs):
-        if isinstance(corpus[0], dict):
-            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
-        else:
-            input_texts = corpus
-        return self.encode(input_texts, **kwargs)
-
 
 class EmbeddingModel:
     """Custom embeddings"""
@@ -28,6 +28,7 @@ class BenchmarkMeta:
     system_prompt: Optional[str] = None
     query_template: Optional[str] = None
     pretty_name: Optional[str] = None
+    description: Optional[str] = None
     filters: Optional[OrderedDict] = None
     extra_params: Optional[Dict] = field(default_factory=dict)
 