evalscope 0.13.1__tar.gz → 0.14.0__tar.gz
This diff compares the contents of two publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release.
- {evalscope-0.13.1/evalscope.egg-info → evalscope-0.14.0}/PKG-INFO +21 -55
- {evalscope-0.13.1 → evalscope-0.14.0}/README.md +10 -4
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/arguments.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/__init__.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/backend_manager.py +21 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/embedding.py +49 -3
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/llm.py +8 -9
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope-0.14.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope-0.14.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope-0.14.0/evalscope/benchmarks/arena_hard/utils.py +162 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/data_adapter.py +30 -2
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope-0.14.0/evalscope/benchmarks/live_code_bench/testing_util.py +537 -0
- evalscope-0.14.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope-0.14.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/evaluator.py +4 -2
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/config.py +2 -2
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/llm_judge.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/chat_adapter.py +32 -11
- evalscope-0.14.0/evalscope/perf/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/arguments.py +30 -9
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/benchmark.py +57 -103
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/http_client.py +2 -3
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/custom_api.py +1 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/openai_api.py +4 -2
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/custom.py +4 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/openqa.py +4 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope-0.14.0/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/benchmark_util.py +12 -6
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/db_util.py +3 -3
- evalscope-0.14.0/evalscope/perf/utils/log_utils.py +41 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/app.py +11 -11
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/run.py +7 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/summarizer.py +2 -1
- evalscope-0.14.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/utils.py +36 -25
- evalscope-0.14.0/evalscope/version.py +4 -0
- {evalscope-0.13.1 → evalscope-0.14.0/evalscope.egg-info}/PKG-INFO +21 -55
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/SOURCES.txt +10 -4
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/requires.txt +10 -51
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/framework.txt +0 -12
- evalscope-0.14.0/requirements/rag.txt +7 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/setup.py +0 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_all.py +36 -27
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_collection.py +2 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/test_run.py +38 -20
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/perf/test_perf.py +1 -2
- evalscope-0.14.0/tests/rag/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_clip_benchmark.py +0 -1
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_mteb.py +37 -8
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/rag/test_ragas.py +33 -27
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/vlm/test_vlmeval.py +37 -1
- evalscope-0.13.1/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope-0.13.1/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope-0.13.1/evalscope/benchmarks/live_code_bench/testing_util.py +0 -721
- evalscope-0.13.1/evalscope/version.py +0 -4
- evalscope-0.13.1/requirements/inner.txt +0 -25
- evalscope-0.13.1/requirements/rag.txt +0 -3
- evalscope-0.13.1/requirements/tests.txt +0 -5
- {evalscope-0.13.1 → evalscope-0.14.0}/LICENSE +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/MANIFEST.in +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/chinese_simple_qa → evalscope-0.14.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/data_collection → evalscope-0.14.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/general_mcq → evalscope-0.14.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/gpqa → evalscope-0.14.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/ifeval → evalscope-0.14.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/iquiz → evalscope-0.14.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/live_code_bench → evalscope-0.14.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/math_500 → evalscope-0.14.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/mmlu_pro → evalscope-0.14.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/musr → evalscope-0.14.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/process_bench → evalscope-0.14.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/simple_qa → evalscope-0.14.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/benchmarks/super_gpqa → evalscope-0.14.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/perf → evalscope-0.14.0/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.13.1/evalscope/perf/utils → evalscope-0.14.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.13.1/evalscope/third_party/thinkbench/tools → evalscope-0.14.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.13.1/tests/rag → evalscope-0.14.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/benchmarks/utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/collections/schema.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/constants.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/evaluator.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/choice_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/custom_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/local_model.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/model.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/register.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/models/server_adapter.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/main.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/combinator.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/generator.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/report/utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/run_arena.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/filters.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/app.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/docs.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/perf.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/requirements.txt +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/setup.cfg +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/cli/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/perf/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/__init__.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/test_run_all.py +0 -0
- {evalscope-0.13.1 → evalscope-0.14.0}/tests/vlm/__init__.py +0 -0
{evalscope-0.13.1/evalscope.egg-info → evalscope-0.14.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.1
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: absl-py
 Requires-Dist: accelerate
-Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
-Requires-Dist: editdistance
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -31,35 +28,31 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: plotly
 Requires-Dist: pyarrow
-Requires-Dist: pympler
 Requires-Dist: pyyaml
-Requires-Dist: regex
 Requires-Dist: requests
-Requires-Dist: requests-toolbelt
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
 Requires-Dist: sympy
 Requires-Dist: tabulate
-Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
-Requires-Dist: transformers_stream_generator
 Requires-Dist: word2number
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 Provides-Extra: rag
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
 Requires-Dist: aiohttp; extra == "perf"
@@ -71,38 +64,9 @@ Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
-Provides-Extra: inner
-Requires-Dist: absl-py; extra == "inner"
-Requires-Dist: accelerate; extra == "inner"
-Requires-Dist: alibaba_itag_sdk; extra == "inner"
-Requires-Dist: dashscope; extra == "inner"
-Requires-Dist: editdistance; extra == "inner"
-Requires-Dist: jsonlines; extra == "inner"
-Requires-Dist: nltk; extra == "inner"
-Requires-Dist: openai; extra == "inner"
-Requires-Dist: pandas==1.5.3; extra == "inner"
-Requires-Dist: plotly; extra == "inner"
-Requires-Dist: pyarrow; extra == "inner"
-Requires-Dist: pyodps; extra == "inner"
-Requires-Dist: pyyaml; extra == "inner"
-Requires-Dist: regex; extra == "inner"
-Requires-Dist: requests==2.28.1; extra == "inner"
-Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
-Requires-Dist: rouge-score; extra == "inner"
-Requires-Dist: sacrebleu; extra == "inner"
-Requires-Dist: scikit-learn; extra == "inner"
-Requires-Dist: seaborn; extra == "inner"
-Requires-Dist: simple-ddl-parser; extra == "inner"
-Requires-Dist: streamlit; extra == "inner"
-Requires-Dist: tqdm; extra == "inner"
-Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
-Requires-Dist: transformers_stream_generator; extra == "inner"
 Provides-Extra: all
-Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
-Requires-Dist: editdistance; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -113,32 +77,28 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
-Requires-Dist: plotly; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
-Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
-Requires-Dist: requests-toolbelt; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sentencepiece; extra == "all"
-Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
-Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -161,7 +121,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 </p>
 
 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -239,6 +199,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -251,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -542,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 
 
+**Supports swanlab for recording results**
+
+
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
{evalscope-0.13.1 → evalscope-0.14.0}/README.md
RENAMED
@@ -10,7 +10,7 @@
 </p>
 
 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -88,6 +88,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -100,15 +103,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -391,6 +393,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 
 
+**Supports swanlab for recording results**
+
+
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
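The two stress-testing items in the News hunks above (the `/v1/completions` endpoint and random-length prompt generation) can be combined in a single perf run. The sketch below is illustrative only: it assumes the `run_perf_benchmark` entry point in `evalscope/perf/main.py` accepts a config dict, and that the key names shown (`url`, `api`, `dataset`, `min_prompt_length`, `max_prompt_length`, `tokenizer_path`, `number`, `parallel`) keep their current meaning; every value is a placeholder.

```python
# Hedged sketch of a stress test against an OpenAI-compatible completions endpoint.
from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8801/v1/completions',  # newly supported completions endpoint
    'api': 'openai',                                # OpenAI-compatible API plugin
    'model': 'Qwen2.5-7B-Instruct',                 # placeholder served model name
    'dataset': 'random',                            # random-length prompt generation
    'min_prompt_length': 128,                       # assumed random-dataset bounds
    'max_prompt_length': 1024,
    'tokenizer_path': 'Qwen/Qwen2.5-7B-Instruct',   # tokenizer used to build random prompts
    'number': 20,                                   # total requests
    'parallel': 4,                                  # concurrent requests
}

run_perf_benchmark(task_cfg)
```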
{evalscope-0.13.1 → evalscope-0.14.0}/evalscope/arguments.py
RENAMED
@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
-    parser.add_argument('--judge-worker-num', type=int, default=
+    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     # yapf: enable
 
 
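A minimal sketch of how these judge flags surface in a programmatic run. It assumes the argparse names above map onto same-named `TaskConfig` fields (`judge_strategy`, `judge_model_args`, `judge_worker_num`); the model id, dataset, and judge settings are placeholders, not documented values.

```python
# Hedged example: judge-related settings in a task config, mirroring the CLI flags above.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',          # placeholder model id
    datasets=['simple_qa'],                      # a judge-based benchmark from the News above
    judge_strategy='auto',                       # assumed string value of JudgeStrategy.AUTO
    judge_worker_num=1,                          # the new default shown in this hunk
    judge_model_args={'model_id': 'qwen-max'},   # hypothetical judge model settings
)
run_task(task_cfg)
```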
{evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/__init__.py
RENAMED
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
 from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
|
|
|
8
8
|
logger = get_logger()
|
|
9
9
|
|
|
10
10
|
|
|
11
|
+
class Tools:
|
|
12
|
+
MTEB = 'mteb'
|
|
13
|
+
RAGAS = 'ragas'
|
|
14
|
+
CLIP_BENCHMARK = 'clip_benchmark'
|
|
15
|
+
|
|
16
|
+
|
|
11
17
|
class RAGEvalBackendManager(BackendManager):
|
|
12
18
|
|
|
13
19
|
def __init__(self, config: Union[str, dict], **kwargs):
|
|
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
 
         if testset_args is not None:
-
+            if isinstance(testset_args, dict):
+                generate_testset(TestsetGenerationArguments(**testset_args))
+            elif isinstance(testset_args, TestsetGenerationArguments):
+                generate_testset(testset_args)
+            else:
+                raise ValueError('Please provide the testset generation arguments.')
         if eval_args is not None:
-
+            if isinstance(eval_args, dict):
+                rag_eval(EvaluationArguments(**eval_args))
+            elif isinstance(eval_args, EvaluationArguments):
+                rag_eval(eval_args)
+            else:
+                raise ValueError('Please provide the evaluation arguments.')
 
     @staticmethod
     def run_clip_benchmark(args):
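With this change, run_ragas accepts either plain dicts or the already-built argument objects. A minimal sketch of the dict form; the top-level keys follow the TestsetGenerationArguments fields shown elsewhere in this diff, while the nested values are illustrative placeholders, not taken from the release:

    # Hypothetical inputs for run_ragas; concrete values are placeholders.
    testset_args = {
        'generator_llm': {'model_name': 'qwen-plus', 'api_base': 'http://localhost:8000/v1'},
        'embeddings': {'model_name': 'text-embedding-v3'},
    }
    eval_args = {}  # EvaluationArguments fields as a dict; passing the dataclass itself also works now
    # run_ragas(testset_args, eval_args) wraps each dict in its corresponding dataclass.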
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
 
     def run(self, *args, **kwargs):
         tool = self.config_d.pop('tool')
-        if tool.lower() ==
+        if tool.lower() == Tools.MTEB:
             self._check_env('mteb')
             model_args = self.config_d['model']
             eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.RAGAS:
             self._check_env('ragas')
             testset_args = self.config_d.get('testset_generation', None)
             eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.CLIP_BENCHMARK:
             self._check_env('webdataset')
             self.run_clip_benchmark(self.config_d['eval'])
         else:
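For reference, run() consumes a config dict whose 'tool' value matches one of the Tools constants, plus the per-tool sections it reads above ('model'/'eval' for MTEB, 'testset_generation'/'eval' for RAGAS, 'eval' for the CLIP benchmark). A hedged sketch of that shape, with every concrete value an illustrative assumption:

    # Keys mirror the lookups in run(); the values are placeholders.
    mteb_config = {
        'tool': 'mteb',                                                 # Tools.MTEB
        'model': [{'model_name_or_path': 'AI-ModelScope/m3e-base'}],    # assumed shape of the model spec
        'eval': {'tasks': ['TNews']},                                   # assumed CMTEB task selection
    }

    ragas_config = {
        'tool': 'ragas',                                                # Tools.RAGAS
        'testset_generation': {'generator_llm': {}, 'embeddings': {}},  # TestsetGenerationArguments as a dict
        'eval': {},                                                     # EvaluationArguments as a dict
    }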
@@ -20,6 +20,12 @@ class ModelArguments:
     encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
     hub: str = 'modelscope' # modelscope or huggingface
 
+    # for API embedding model
+    model_name: Optional[str] = None
+    api_base: Optional[str] = None
+    api_key: Optional[str] = None
+    dimensions: Optional[int] = None
+
     def to_dict(self) -> Dict[str, Any]:
         return {
             'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
             'config_kwargs': self.config_kwargs,
             'encode_kwargs': self.encode_kwargs,
             'hub': self.hub,
+            'model_name': self.model_name,
+            'api_base': self.api_base,
+            'api_key': self.api_key,
+            'dimensions': self.dimensions,
         }
 
 
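With these new fields a CMTEB ModelArguments can describe an API-served embedding model instead of a local checkpoint, and to_dict() simply forwards them. A minimal sketch of such a configuration; every concrete value below is a placeholder:

    # Hypothetical API embedding configuration (placeholders throughout).
    api_embedding_model = {
        'model_name': 'text-embedding-v3',     # a non-empty model_name selects the API path
        'api_base': 'https://example.com/v1',  # OpenAI-compatible endpoint
        'api_key': 'sk-xxx',
        'dimensions': 1024,                    # optional output dimensionality
        'encode_kwargs': {'batch_size': 16},
    }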
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
    embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
{evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py RENAMED
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
 
 
 def load_data(file_path):
-
+    import nltk
+    from langchain_unstructured import UnstructuredLoader
 
-
+    if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+        # need to download nltk data for the first time
+        nltk.download('averaged_perceptron_tagger_eng')
+
+    loader = UnstructuredLoader(file_path)
     data = loader.load()
     return data
 
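One caveat about the check above: nltk.data.find() raises LookupError when a resource is missing rather than returning False, so the `is False` comparison will not normally trigger the download. A more defensive variant (a sketch, not part of this release) would be:

    import nltk

    # Download the English POS-tagger data only if it is genuinely absent;
    # nltk.data.find raises LookupError instead of returning False.
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger_eng')
    except LookupError:
        nltk.download('averaged_perceptron_tagger_eng')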
{evalscope-0.13.1 → evalscope-0.14.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py RENAMED
@@ -2,7 +2,6 @@ import asyncio
 import os
 from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin, PydanticPrompt
-from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
 from typing import List
 
 from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
     llm: BaseRagasLLM,
     adapt_instruction: bool = False,
 ):
-    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
-        return
-
     if not issubclass(type(prompt_user), PromptMixin):
         logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
         return
@@ -1,10 +1,12 @@
 import os
 import torch
 from langchain_core.embeddings import Embeddings
+from langchain_openai.embeddings import OpenAIEmbeddings
 from sentence_transformers import models
 from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from tqdm import tqdm
 from typing import Dict, List, Optional, Union
 
 from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
 
     def __init__(
         self,
-        model_name_or_path: str,
+        model_name_or_path: str = '',
         max_seq_length: int = 512,
         prompt: str = '',
-        revision: Optional[str] =
+        revision: Optional[str] = 'master',
         **kwargs,
     ):
         self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
         )
 
-    def predict(self, sentences: List[List[str]], **kwargs) ->
+    def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
         self.encode_kwargs.update(kwargs)
 
         if len(sentences[0]) == 3: # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
         return embeddings
 
 
+class APIEmbeddingModel(BaseModel):
+
+    def __init__(self, **kwargs):
+        self.model_name = kwargs.get('model_name')
+        self.openai_api_base = kwargs.get('api_base')
+        self.openai_api_key = kwargs.get('api_key')
+        self.dimensions = kwargs.get('dimensions')
+
+        self.model = OpenAIEmbeddings(
+            model=self.model_name,
+            openai_api_base=self.openai_api_base,
+            openai_api_key=self.openai_api_key,
+            dimensions=self.dimensions,
+            check_embedding_ctx_length=False)
+
+        super().__init__(model_name_or_path=self.model_name, **kwargs)
+
+        self.batch_size = self.encode_kwargs.get('batch_size', 10)
+
+    def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
+        if isinstance(texts, str):
+            texts = [texts]
+
+        embeddings: List[List[float]] = []
+        for i in tqdm(range(0, len(texts), self.batch_size)):
+            response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
+            embeddings.extend(response)
+        return torch.tensor(embeddings)
+
+    def encode_queries(self, queries, **kwargs):
+        return self.encode(queries, **kwargs)
+
+    def encode_corpus(self, corpus, **kwargs):
+        if isinstance(corpus[0], dict):
+            input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
+        else:
+            input_texts = corpus
+        return self.encode(input_texts, **kwargs)
+
+
 class EmbeddingModel:
     """Custom embeddings"""
 
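Given the constructor above, the API-backed embedder can be exercised directly, and the EmbeddingModel factory below routes to it whenever a model_name is supplied. A hypothetical usage sketch; the endpoint, key, and model id are placeholders, and a running OpenAI-compatible embedding service is assumed:

    # Hypothetical usage of the new APIEmbeddingModel (placeholders throughout).
    embedder = APIEmbeddingModel(
        model_name='text-embedding-v3',
        api_base='https://example.com/v1',
        api_key='sk-xxx',
        dimensions=1024,
        encode_kwargs={'batch_size': 16},
    )
    vectors = embedder.encode(['what is RAG?', 'retrieval augmented generation'])
    # vectors is a torch.Tensor of shape (2, 1024) for the assumed dimension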
@@ -165,6 +207,10 @@ class EmbeddingModel:
         revision: Optional[str] = 'master',
         **kwargs,
     ):
+        if kwargs.get('model_name'):
+            # If model_name is provided, use OpenAIEmbeddings
+            return APIEmbeddingModel(**kwargs)
+
         # If model path does not exist and hub is 'modelscope', download the model
         if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)
@@ -2,11 +2,11 @@ import os
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
 from langchain_openai import ChatOpenAI
-from
+from transformers.generation.configuration_utils import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 
 from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models import ChatGenerationModelAdapter
+from evalscope.models import ChatGenerationModelAdapter, LocalModel
 
 
 class LLM:
@@ -16,9 +16,9 @@ class LLM:
         api_base = kw.get('api_base', None)
         if api_base:
             return ChatOpenAI(
-
-
-
+                model=kw.get('model_name', ''),
+                base_url=api_base,
+                api_key=kw.get('api_key', 'EMPTY'),
             )
         else:
             return LocalLLM(**kw)
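The factory above now builds the ChatOpenAI client entirely from the incoming kwargs: the presence of 'api_base' selects the API client, otherwise a LocalLLM is constructed from a local checkpoint. A hedged sketch of the two kwarg shapes, with all values as placeholders:

    # API-served model: routed to ChatOpenAI by the presence of 'api_base'.
    api_llm_kwargs = {
        'model_name': 'qwen-plus',               # placeholder model id
        'api_base': 'https://example.com/v1',    # placeholder OpenAI-compatible endpoint
        'api_key': 'sk-xxx',                     # falls back to 'EMPTY' if omitted
    }

    # Local checkpoint: falls through to LocalLLM / ChatGenerationModelAdapter.
    local_llm_kwargs = {
        'model_name_or_path': 'Qwen/Qwen2.5-0.5B-Instruct',   # placeholder
        'generation_config': {'max_new_tokens': 256},         # wrapped in transformers GenerationConfig
    }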
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
         self.model = ChatGenerationModelAdapter(
-            model_id=self.model_name_or_path,
-            model_revision=self.model_revision,
+            model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
         )
 
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
         """Run the LLM on the given input."""
         infer_cfg = {'stop': stop}
 
-        response = self.model._model_generate(prompt, infer_cfg)
-        return response
+        response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+        return response[0][0]
 
     @property
     def _identifying_params(self) -> Dict[str, Any]:
@@ -1,4 +1,5 @@
 import copy
+import os
 import subprocess
 from functools import partial
 from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['name'] # remove not used args
             del remain_cfg['type'] # remove not used args
 
-
-
+            norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
+            new_model_names.append(norm_model_type)
         else:
             remain_cfg = copy.deepcopy(model_cfg)
             del remain_cfg['name'] # remove not used args
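The normalization added above strips any path prefix from an API-style model id and replaces characters that are awkward in registry keys; with an illustrative id the transformation behaves like this (the id itself is a placeholder):

    import os

    model_type = 'Qwen/qwen2.5-vl-7b:latest'   # placeholder model id
    norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
    print(norm_model_type)  # qwen2_5-vl-7b-latest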